{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "09172ef7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import datetime\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import lightgbm as lgb\n",
    "from tqdm import tqdm\n",
    "import math"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "55cc78fc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the raw tables used throughout the notebook.\n",
    "all_data = pd.read_csv('./video_predict_data/main_vv_seq_train_final.csv')  # viewing-sequence records\n",
    "vid_info = pd.read_csv('./video_predict_data/vid_info.csv')  # video metadata\n",
    "candidate_items = pd.read_csv('./video_predict_data/candidate_items_B.csv')  # candidate set\n",
    "\n",
    "# 1-based position of every row inside its `did` group, following file order.\n",
    "all_data['rank'] = all_data.groupby('did').cumcount() + 1\n",
    "# vid of the preceding row of the same `did`; NaN on each group's first row.\n",
    "all_data['vid_next'] = all_data.groupby('did')['vid'].shift(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "3ac13a42",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>did</th>\n",
       "      <th>vid</th>\n",
       "      <th>vts</th>\n",
       "      <th>hb</th>\n",
       "      <th>seq_no</th>\n",
       "      <th>cpn</th>\n",
       "      <th>fpn</th>\n",
       "      <th>time_gap</th>\n",
       "      <th>rank</th>\n",
       "      <th>vid_next</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>82351</td>\n",
       "      <td>0000d0aabe8c188f88c756ce0f7f9639</td>\n",
       "      <td>31c50bb6df1a3087c29f43d9cfda0197</td>\n",
       "      <td>105.0</td>\n",
       "      <td>43.0</td>\n",
       "      <td>7.0</td>\n",
       "      <td>9</td>\n",
       "      <td>68</td>\n",
       "      <td>11636.0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2696555</td>\n",
       "      <td>0000d0aabe8c188f88c756ce0f7f9639</td>\n",
       "      <td>815a870db0d94166e330ae97f7068de2</td>\n",
       "      <td>3300.0</td>\n",
       "      <td>6356.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>1</td>\n",
       "      <td>130</td>\n",
       "      <td>4625.0</td>\n",
       "      <td>2</td>\n",
       "      <td>31c50bb6df1a3087c29f43d9cfda0197</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2696556</td>\n",
       "      <td>0000d0aabe8c188f88c756ce0f7f9639</td>\n",
       "      <td>449a1829a8742e652cb39d6ae7523df1</td>\n",
       "      <td>1620.0</td>\n",
       "      <td>1983.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1</td>\n",
       "      <td>130</td>\n",
       "      <td>3103.0</td>\n",
       "      <td>3</td>\n",
       "      <td>815a870db0d94166e330ae97f7068de2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0                               did  \\\n",
       "0       82351  0000d0aabe8c188f88c756ce0f7f9639   \n",
       "1     2696555  0000d0aabe8c188f88c756ce0f7f9639   \n",
       "2     2696556  0000d0aabe8c188f88c756ce0f7f9639   \n",
       "\n",
       "                                vid     vts      hb  seq_no  cpn  fpn  \\\n",
       "0  31c50bb6df1a3087c29f43d9cfda0197   105.0    43.0     7.0    9   68   \n",
       "1  815a870db0d94166e330ae97f7068de2  3300.0  6356.0     6.0    1  130   \n",
       "2  449a1829a8742e652cb39d6ae7523df1  1620.0  1983.0     5.0    1  130   \n",
       "\n",
       "   time_gap  rank                          vid_next  \n",
       "0   11636.0     1                               NaN  \n",
       "1    4625.0     2  31c50bb6df1a3087c29f43d9cfda0197  \n",
       "2    3103.0     3  815a870db0d94166e330ae97f7068de2  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Peek at the first three rows, including the derived rank / vid_next columns.\n",
    "all_data.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "624d19ff",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the data: training keeps rows with rank >= 2 (the rank-1 row of each did has a NaN vid_next);\n",
    "# the test frame is the full table (the same object, not a copy).\n",
    "train_origin_data = all_data.loc[all_data['rank'].ge(2)]\n",
    "test_origin_data = all_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "id": "25b9b5bc",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "5425504it [04:28, 20232.84it/s]\n"
     ]
    }
   ],
   "source": [
    "# Build, for every did, a space-separated string of its vids (later posted to the feature service).\n",
    "# The first row seen for a did seeds the string; each of up to MAX_EXTRA_VIDS following rows is\n",
    "# prepended, so the string is in reverse row order and holds at most MAX_EXTRA_VIDS + 1 vids.\n",
    "MAX_EXTRA_VIDS = 10\n",
    "train_did_seq_fea = {}\n",
    "train_did_seq_len = {}  # number of vids prepended so far, per did\n",
    "train_did_seq_pd = train_origin_data[['did', 'vid']]\n",
    "# itertuples yields plain (did, vid) tuples: much faster than iterrows and it avoids the\n",
    "# deprecated positional `row[0]` lookup on a label-indexed Series.\n",
    "for did, vid in tqdm(train_did_seq_pd.itertuples(index=False, name=None), total=len(train_did_seq_pd)):\n",
    "    if did not in train_did_seq_fea:\n",
    "        train_did_seq_fea[did] = str(vid)\n",
    "        train_did_seq_len[did] = 0\n",
    "    elif train_did_seq_len[did] < MAX_EXTRA_VIDS:\n",
    "        train_did_seq_len[did] += 1\n",
    "        train_did_seq_fea[did] = str(vid) + ' ' + train_did_seq_fea[did]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "id": "29f9ba1d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'f87cf2ad695b4bb5dd830ae40bf29475 fde2b2a62fb6061e4a958fb0b78c0293 2c47a9311f2f19d6b33670715cdc544d bee6becd984e6486215f7acd876b5d26 449a1829a8742e652cb39d6ae7523df1'"
      ]
     },
     "execution_count": 98,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot-check the vid sequence string built for one did.\n",
    "train_did_seq_fea['0000d0aabe8c188f88c756ce0f7f9639']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "id": "d704c604",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 170909/170909 [32:47<00:00, 86.87it/s]\n"
     ]
    }
   ],
   "source": [
    "import requests\n",
    "import json\n",
    "from collections import defaultdict\n",
    "import time\n",
    "\n",
    "# Feature-service endpoint: each request posts a `query` string and reads back the `data` field.\n",
    "host = \"http://172.70.10.21:8009/\"\n",
    "endpoint = \"get_feature\"\n",
    "url = ''.join([host, endpoint])\n",
    "REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled request would hang the loop forever\n",
    "\n",
    "train_did_seq_fea_bert = defaultdict(list)\n",
    "# One Session reuses the HTTP connection across the many sequential requests.\n",
    "with requests.Session() as session:\n",
    "    for did, vid_seq in tqdm(train_did_seq_fea.items()):\n",
    "        resp = session.post(url, data=json.dumps({\"query\": vid_seq}), timeout=REQUEST_TIMEOUT)\n",
    "        # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.\n",
    "        resp.raise_for_status()\n",
    "        train_did_seq_fea_bert[did] = json.loads(resp.text)['data']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "id": "e4f52a3a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 12325/12325 [02:12<00:00, 93.01it/s]\n"
     ]
    }
   ],
   "source": [
    "# Fetch one feature vector per candidate vid from the feature service.\n",
    "cand = candidate_items['vid']\n",
    "host = \"http://172.70.10.21:8009/\"\n",
    "endpoint = \"get_feature\"\n",
    "url = ''.join([host, endpoint])\n",
    "# train_did_seq_fea_bert is deliberately NOT re-created here: it holds the per-did features\n",
    "# fetched by the preceding cell, and resetting it would silently discard them.\n",
    "vid_bert_em = {}\n",
    "# One Session reuses the HTTP connection across the sequential requests.\n",
    "with requests.Session() as session:\n",
    "    for vid in tqdm(cand):\n",
    "        resp = session.post(url, data=json.dumps({\"query\": vid}), timeout=30)\n",
    "        # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.\n",
    "        resp.raise_for_status()\n",
    "        vid_bert_em[vid] = json.loads(resp.text)['data']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "id": "699a7ff5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tag each feature list with its own vid as a trailing element, so the vid ends up as a column\n",
    "# once the dict is turned into a DataFrame. The guard keeps the cell idempotent: re-running it\n",
    "# must not append the vid a second time (which would create a spurious extra column).\n",
    "for vid, embedding in vid_bert_em.items():\n",
    "    if not embedding or embedding[-1] != vid:\n",
    "        embedding.append(vid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "id": "58ffad2b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "# Dict keys (vids) become columns first; transposing turns them into the row index.\n",
    "vid_bert_em_pd = pd.DataFrame(vid_bert_em).transpose()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 168,
   "id": "2632b4ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column 73 is only present when the vid-append cell was executed more than once in the same\n",
    "# kernel; errors='ignore' keeps a fresh Restart & Run All from raising KeyError here.\n",
    "vid_bert_em_pd = vid_bert_em_pd.drop(columns=73, errors='ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 169,
   "id": "df20c622",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>63</th>\n",
       "      <th>64</th>\n",
       "      <th>65</th>\n",
       "      <th>66</th>\n",
       "      <th>67</th>\n",
       "      <th>68</th>\n",
       "      <th>69</th>\n",
       "      <th>70</th>\n",
       "      <th>71</th>\n",
       "      <th>72</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4e6091b8553e6fb25c867cdc7d5608d9</th>\n",
       "      <td>0.224221</td>\n",
       "      <td>0.163928</td>\n",
       "      <td>-0.096474</td>\n",
       "      <td>-0.313437</td>\n",
       "      <td>-0.032228</td>\n",
       "      <td>0.073984</td>\n",
       "      <td>-0.036641</td>\n",
       "      <td>-0.074896</td>\n",
       "      <td>0.079085</td>\n",
       "      <td>-0.028804</td>\n",
       "      <td>...</td>\n",
       "      <td>0.254407</td>\n",
       "      <td>-0.150296</td>\n",
       "      <td>0.124841</td>\n",
       "      <td>0.016994</td>\n",
       "      <td>0.197462</td>\n",
       "      <td>0.25678</td>\n",
       "      <td>-0.019747</td>\n",
       "      <td>0.063751</td>\n",
       "      <td>-0.162092</td>\n",
       "      <td>4e6091b8553e6fb25c867cdc7d5608d9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3ac2ed6c901a8c1f61aaccd5f535610e</th>\n",
       "      <td>-0.046063</td>\n",
       "      <td>-0.060179</td>\n",
       "      <td>0.122003</td>\n",
       "      <td>-0.290268</td>\n",
       "      <td>-0.051804</td>\n",
       "      <td>-0.051109</td>\n",
       "      <td>0.044799</td>\n",
       "      <td>0.093681</td>\n",
       "      <td>0.117761</td>\n",
       "      <td>-0.113826</td>\n",
       "      <td>...</td>\n",
       "      <td>0.042109</td>\n",
       "      <td>-0.098792</td>\n",
       "      <td>0.247691</td>\n",
       "      <td>-0.036003</td>\n",
       "      <td>0.248261</td>\n",
       "      <td>0.196554</td>\n",
       "      <td>-0.00164</td>\n",
       "      <td>-0.17506</td>\n",
       "      <td>-0.025497</td>\n",
       "      <td>3ac2ed6c901a8c1f61aaccd5f535610e</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a06f451ac97e416a3e23dc83d779b450</th>\n",
       "      <td>-0.088922</td>\n",
       "      <td>0.0953</td>\n",
       "      <td>0.044043</td>\n",
       "      <td>-0.281063</td>\n",
       "      <td>-0.086201</td>\n",
       "      <td>-0.081579</td>\n",
       "      <td>0.056584</td>\n",
       "      <td>-0.019975</td>\n",
       "      <td>-0.073882</td>\n",
       "      <td>0.073419</td>\n",
       "      <td>...</td>\n",
       "      <td>0.103352</td>\n",
       "      <td>-0.03589</td>\n",
       "      <td>0.182418</td>\n",
       "      <td>-0.146269</td>\n",
       "      <td>-0.009281</td>\n",
       "      <td>0.258288</td>\n",
       "      <td>-0.056532</td>\n",
       "      <td>0.179102</td>\n",
       "      <td>-0.105327</td>\n",
       "      <td>a06f451ac97e416a3e23dc83d779b450</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>aed3813753bb154c31ba0baa5755fa52</th>\n",
       "      <td>0.265366</td>\n",
       "      <td>0.069603</td>\n",
       "      <td>-0.116056</td>\n",
       "      <td>-0.064341</td>\n",
       "      <td>-0.223334</td>\n",
       "      <td>-0.139857</td>\n",
       "      <td>-0.117373</td>\n",
       "      <td>-0.003364</td>\n",
       "      <td>0.165045</td>\n",
       "      <td>0.138165</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.006653</td>\n",
       "      <td>-0.272331</td>\n",
       "      <td>0.271812</td>\n",
       "      <td>0.065179</td>\n",
       "      <td>0.224623</td>\n",
       "      <td>0.079269</td>\n",
       "      <td>-0.016956</td>\n",
       "      <td>-0.02744</td>\n",
       "      <td>-0.155299</td>\n",
       "      <td>aed3813753bb154c31ba0baa5755fa52</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50999680985c2e4b8e1ca38a23496571</th>\n",
       "      <td>-0.085978</td>\n",
       "      <td>0.072255</td>\n",
       "      <td>-0.038656</td>\n",
       "      <td>-0.039911</td>\n",
       "      <td>-0.197762</td>\n",
       "      <td>0.008704</td>\n",
       "      <td>-0.178474</td>\n",
       "      <td>0.028002</td>\n",
       "      <td>0.031365</td>\n",
       "      <td>-0.051818</td>\n",
       "      <td>...</td>\n",
       "      <td>0.166202</td>\n",
       "      <td>-0.143758</td>\n",
       "      <td>0.170843</td>\n",
       "      <td>-0.087539</td>\n",
       "      <td>0.126389</td>\n",
       "      <td>0.093644</td>\n",
       "      <td>-0.10491</td>\n",
       "      <td>-0.160005</td>\n",
       "      <td>0.017439</td>\n",
       "      <td>50999680985c2e4b8e1ca38a23496571</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c6716ba9e95a880fa57041ec6f2cb8e3</th>\n",
       "      <td>-0.024419</td>\n",
       "      <td>0.187523</td>\n",
       "      <td>-0.039731</td>\n",
       "      <td>-0.151485</td>\n",
       "      <td>-0.138058</td>\n",
       "      <td>-0.066764</td>\n",
       "      <td>0.025493</td>\n",
       "      <td>0.074366</td>\n",
       "      <td>0.194684</td>\n",
       "      <td>0.019213</td>\n",
       "      <td>...</td>\n",
       "      <td>0.143771</td>\n",
       "      <td>-0.197254</td>\n",
       "      <td>0.049204</td>\n",
       "      <td>-0.064431</td>\n",
       "      <td>-0.0918</td>\n",
       "      <td>0.059095</td>\n",
       "      <td>0.18631</td>\n",
       "      <td>-0.051492</td>\n",
       "      <td>-0.171931</td>\n",
       "      <td>c6716ba9e95a880fa57041ec6f2cb8e3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>730890cd34ac161117a612deee0921ac</th>\n",
       "      <td>0.103679</td>\n",
       "      <td>-0.204068</td>\n",
       "      <td>-0.108106</td>\n",
       "      <td>-0.293693</td>\n",
       "      <td>0.074555</td>\n",
       "      <td>-0.235991</td>\n",
       "      <td>-0.049014</td>\n",
       "      <td>0.029087</td>\n",
       "      <td>0.175783</td>\n",
       "      <td>-0.071693</td>\n",
       "      <td>...</td>\n",
       "      <td>0.117723</td>\n",
       "      <td>-0.13412</td>\n",
       "      <td>0.142837</td>\n",
       "      <td>0.074518</td>\n",
       "      <td>0.117027</td>\n",
       "      <td>-0.001979</td>\n",
       "      <td>0.049145</td>\n",
       "      <td>0.240171</td>\n",
       "      <td>-0.20419</td>\n",
       "      <td>730890cd34ac161117a612deee0921ac</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57e5e7fcd9287ddaf5195e6e0230e01a</th>\n",
       "      <td>0.047227</td>\n",
       "      <td>0.046551</td>\n",
       "      <td>-0.085019</td>\n",
       "      <td>-0.325037</td>\n",
       "      <td>-0.049108</td>\n",
       "      <td>0.103695</td>\n",
       "      <td>0.029109</td>\n",
       "      <td>-0.009595</td>\n",
       "      <td>0.276551</td>\n",
       "      <td>-0.176007</td>\n",
       "      <td>...</td>\n",
       "      <td>0.233791</td>\n",
       "      <td>-0.082447</td>\n",
       "      <td>0.042312</td>\n",
       "      <td>-0.025665</td>\n",
       "      <td>0.28851</td>\n",
       "      <td>0.231893</td>\n",
       "      <td>-0.047362</td>\n",
       "      <td>0.218689</td>\n",
       "      <td>-0.194203</td>\n",
       "      <td>57e5e7fcd9287ddaf5195e6e0230e01a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>96127de9846eb55b3fe4ccedb23ded4b</th>\n",
       "      <td>0.178825</td>\n",
       "      <td>0.092693</td>\n",
       "      <td>-0.147028</td>\n",
       "      <td>-0.178634</td>\n",
       "      <td>-0.116189</td>\n",
       "      <td>0.010839</td>\n",
       "      <td>-0.285221</td>\n",
       "      <td>-0.074372</td>\n",
       "      <td>0.054715</td>\n",
       "      <td>-0.161686</td>\n",
       "      <td>...</td>\n",
       "      <td>0.204444</td>\n",
       "      <td>-0.202406</td>\n",
       "      <td>-0.010553</td>\n",
       "      <td>-0.177014</td>\n",
       "      <td>0.221002</td>\n",
       "      <td>0.279636</td>\n",
       "      <td>-0.006578</td>\n",
       "      <td>0.110608</td>\n",
       "      <td>-0.174116</td>\n",
       "      <td>96127de9846eb55b3fe4ccedb23ded4b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>eda99244ee59adc030fa832298eb3fc2</th>\n",
       "      <td>-0.014867</td>\n",
       "      <td>0.29465</td>\n",
       "      <td>-0.030907</td>\n",
       "      <td>-0.344912</td>\n",
       "      <td>-0.181097</td>\n",
       "      <td>-0.017894</td>\n",
       "      <td>0.033809</td>\n",
       "      <td>-0.10483</td>\n",
       "      <td>-0.078909</td>\n",
       "      <td>-0.224345</td>\n",
       "      <td>...</td>\n",
       "      <td>0.160177</td>\n",
       "      <td>-0.073975</td>\n",
       "      <td>0.349831</td>\n",
       "      <td>0.095981</td>\n",
       "      <td>0.10786</td>\n",
       "      <td>0.216804</td>\n",
       "      <td>0.081045</td>\n",
       "      <td>-0.092526</td>\n",
       "      <td>-0.149761</td>\n",
       "      <td>eda99244ee59adc030fa832298eb3fc2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>12325 rows × 73 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        0         1         2         3   \\\n",
       "4e6091b8553e6fb25c867cdc7d5608d9  0.224221  0.163928 -0.096474 -0.313437   \n",
       "3ac2ed6c901a8c1f61aaccd5f535610e -0.046063 -0.060179  0.122003 -0.290268   \n",
       "a06f451ac97e416a3e23dc83d779b450 -0.088922    0.0953  0.044043 -0.281063   \n",
       "aed3813753bb154c31ba0baa5755fa52  0.265366  0.069603 -0.116056 -0.064341   \n",
       "50999680985c2e4b8e1ca38a23496571 -0.085978  0.072255 -0.038656 -0.039911   \n",
       "...                                    ...       ...       ...       ...   \n",
       "c6716ba9e95a880fa57041ec6f2cb8e3 -0.024419  0.187523 -0.039731 -0.151485   \n",
       "730890cd34ac161117a612deee0921ac  0.103679 -0.204068 -0.108106 -0.293693   \n",
       "57e5e7fcd9287ddaf5195e6e0230e01a  0.047227  0.046551 -0.085019 -0.325037   \n",
       "96127de9846eb55b3fe4ccedb23ded4b  0.178825  0.092693 -0.147028 -0.178634   \n",
       "eda99244ee59adc030fa832298eb3fc2 -0.014867   0.29465 -0.030907 -0.344912   \n",
       "\n",
       "                                        4         5         6         7   \\\n",
       "4e6091b8553e6fb25c867cdc7d5608d9 -0.032228  0.073984 -0.036641 -0.074896   \n",
       "3ac2ed6c901a8c1f61aaccd5f535610e -0.051804 -0.051109  0.044799  0.093681   \n",
       "a06f451ac97e416a3e23dc83d779b450 -0.086201 -0.081579  0.056584 -0.019975   \n",
       "aed3813753bb154c31ba0baa5755fa52 -0.223334 -0.139857 -0.117373 -0.003364   \n",
       "50999680985c2e4b8e1ca38a23496571 -0.197762  0.008704 -0.178474  0.028002   \n",
       "...                                    ...       ...       ...       ...   \n",
       "c6716ba9e95a880fa57041ec6f2cb8e3 -0.138058 -0.066764  0.025493  0.074366   \n",
       "730890cd34ac161117a612deee0921ac  0.074555 -0.235991 -0.049014  0.029087   \n",
       "57e5e7fcd9287ddaf5195e6e0230e01a -0.049108  0.103695  0.029109 -0.009595   \n",
       "96127de9846eb55b3fe4ccedb23ded4b -0.116189  0.010839 -0.285221 -0.074372   \n",
       "eda99244ee59adc030fa832298eb3fc2 -0.181097 -0.017894  0.033809  -0.10483   \n",
       "\n",
       "                                        8         9   ...        63        64  \\\n",
       "4e6091b8553e6fb25c867cdc7d5608d9  0.079085 -0.028804  ...  0.254407 -0.150296   \n",
       "3ac2ed6c901a8c1f61aaccd5f535610e  0.117761 -0.113826  ...  0.042109 -0.098792   \n",
       "a06f451ac97e416a3e23dc83d779b450 -0.073882  0.073419  ...  0.103352  -0.03589   \n",
       "aed3813753bb154c31ba0baa5755fa52  0.165045  0.138165  ... -0.006653 -0.272331   \n",
       "50999680985c2e4b8e1ca38a23496571  0.031365 -0.051818  ...  0.166202 -0.143758   \n",
       "...                                    ...       ...  ...       ...       ...   \n",
       "c6716ba9e95a880fa57041ec6f2cb8e3  0.194684  0.019213  ...  0.143771 -0.197254   \n",
       "730890cd34ac161117a612deee0921ac  0.175783 -0.071693  ...  0.117723  -0.13412   \n",
       "57e5e7fcd9287ddaf5195e6e0230e01a  0.276551 -0.176007  ...  0.233791 -0.082447   \n",
       "96127de9846eb55b3fe4ccedb23ded4b  0.054715 -0.161686  ...  0.204444 -0.202406   \n",
       "eda99244ee59adc030fa832298eb3fc2 -0.078909 -0.224345  ...  0.160177 -0.073975   \n",
       "\n",
       "                                        65        66        67        68  \\\n",
       "4e6091b8553e6fb25c867cdc7d5608d9  0.124841  0.016994  0.197462   0.25678   \n",
       "3ac2ed6c901a8c1f61aaccd5f535610e  0.247691 -0.036003  0.248261  0.196554   \n",
       "a06f451ac97e416a3e23dc83d779b450  0.182418 -0.146269 -0.009281  0.258288   \n",
       "aed3813753bb154c31ba0baa5755fa52  0.271812  0.065179  0.224623  0.079269   \n",
       "50999680985c2e4b8e1ca38a23496571  0.170843 -0.087539  0.126389  0.093644   \n",
       "...                                    ...       ...       ...       ...   \n",
       "c6716ba9e95a880fa57041ec6f2cb8e3  0.049204 -0.064431   -0.0918  0.059095   \n",
       "730890cd34ac161117a612deee0921ac  0.142837  0.074518  0.117027 -0.001979   \n",
       "57e5e7fcd9287ddaf5195e6e0230e01a  0.042312 -0.025665   0.28851  0.231893   \n",
       "96127de9846eb55b3fe4ccedb23ded4b -0.010553 -0.177014  0.221002  0.279636   \n",
       "eda99244ee59adc030fa832298eb3fc2  0.349831  0.095981   0.10786  0.216804   \n",
       "\n",
       "                                        69        70        71  \\\n",
       "4e6091b8553e6fb25c867cdc7d5608d9 -0.019747  0.063751 -0.162092   \n",
       "3ac2ed6c901a8c1f61aaccd5f535610e  -0.00164  -0.17506 -0.025497   \n",
       "a06f451ac97e416a3e23dc83d779b450 -0.056532  0.179102 -0.105327   \n",
       "aed3813753bb154c31ba0baa5755fa52 -0.016956  -0.02744 -0.155299   \n",
       "50999680985c2e4b8e1ca38a23496571  -0.10491 -0.160005  0.017439   \n",
       "...                                    ...       ...       ...   \n",
       "c6716ba9e95a880fa57041ec6f2cb8e3   0.18631 -0.051492 -0.171931   \n",
       "730890cd34ac161117a612deee0921ac  0.049145  0.240171  -0.20419   \n",
       "57e5e7fcd9287ddaf5195e6e0230e01a -0.047362  0.218689 -0.194203   \n",
       "96127de9846eb55b3fe4ccedb23ded4b -0.006578  0.110608 -0.174116   \n",
       "eda99244ee59adc030fa832298eb3fc2  0.081045 -0.092526 -0.149761   \n",
       "\n",
       "                                                                72  \n",
       "4e6091b8553e6fb25c867cdc7d5608d9  4e6091b8553e6fb25c867cdc7d5608d9  \n",
       "3ac2ed6c901a8c1f61aaccd5f535610e  3ac2ed6c901a8c1f61aaccd5f535610e  \n",
       "a06f451ac97e416a3e23dc83d779b450  a06f451ac97e416a3e23dc83d779b450  \n",
       "aed3813753bb154c31ba0baa5755fa52  aed3813753bb154c31ba0baa5755fa52  \n",
       "50999680985c2e4b8e1ca38a23496571  50999680985c2e4b8e1ca38a23496571  \n",
       "...                                                            ...  \n",
       "c6716ba9e95a880fa57041ec6f2cb8e3  c6716ba9e95a880fa57041ec6f2cb8e3  \n",
       "730890cd34ac161117a612deee0921ac  730890cd34ac161117a612deee0921ac  \n",
       "57e5e7fcd9287ddaf5195e6e0230e01a  57e5e7fcd9287ddaf5195e6e0230e01a  \n",
       "96127de9846eb55b3fe4ccedb23ded4b  96127de9846eb55b3fe4ccedb23ded4b  \n",
       "eda99244ee59adc030fa832298eb3fc2  eda99244ee59adc030fa832298eb3fc2  \n",
       "\n",
       "[12325 rows x 73 columns]"
      ]
     },
     "execution_count": 169,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the feature table (pandas renders a truncated view of rows and columns).\n",
    "vid_bert_em_pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 170,
   "id": "e95e733d",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Column 72 carries the vid appended to each feature list; label it 'vid_next'\n",
    "# (presumably so it can be matched against all_data['vid_next'] - confirm against later cells).\n",
    "vid_bert_em_pd = vid_bert_em_pd.rename({72: 'vid_next'}, axis='columns')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 171,
   "id": "7dcd1485",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns 0..71 are the numeric feature dimensions; cast them to float ('vid_next' is left out).\n",
    "embedding_cols = list(range(72))\n",
    "vid_bert_em_pd_tmp = vid_bert_em_pd[embedding_cols].astype(float)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 173,
   "id": "cc04f302",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-attach the vid_next column to the float-typed feature columns (aligned on the index).\n",
    "vid_bert_em_pd_fin = pd.concat(\n",
    "    [vid_bert_em_pd_tmp, vid_bert_em_pd['vid_next']],\n",
    "    axis='columns',\n",
    "    join='outer',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 174,
   "id": "95010788",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>63</th>\n",
       "      <th>64</th>\n",
       "      <th>65</th>\n",
       "      <th>66</th>\n",
       "      <th>67</th>\n",
       "      <th>68</th>\n",
       "      <th>69</th>\n",
       "      <th>70</th>\n",
       "      <th>71</th>\n",
       "      <th>vid_next</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4e6091b8553e6fb25c867cdc7d5608d9</th>\n",
       "      <td>0.224221</td>\n",
       "      <td>0.163928</td>\n",
       "      <td>-0.096474</td>\n",
       "      <td>-0.313437</td>\n",
       "      <td>-0.032228</td>\n",
       "      <td>0.073984</td>\n",
       "      <td>-0.036641</td>\n",
       "      <td>-0.074896</td>\n",
       "      <td>0.079085</td>\n",
       "      <td>-0.028804</td>\n",
       "      <td>...</td>\n",
       "      <td>0.254407</td>\n",
       "      <td>-0.150296</td>\n",
       "      <td>0.124841</td>\n",
       "      <td>0.016994</td>\n",
       "      <td>0.197462</td>\n",
       "      <td>0.256780</td>\n",
       "      <td>-0.019747</td>\n",
       "      <td>0.063751</td>\n",
       "      <td>-0.162092</td>\n",
       "      <td>4e6091b8553e6fb25c867cdc7d5608d9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3ac2ed6c901a8c1f61aaccd5f535610e</th>\n",
       "      <td>-0.046063</td>\n",
       "      <td>-0.060179</td>\n",
       "      <td>0.122003</td>\n",
       "      <td>-0.290268</td>\n",
       "      <td>-0.051804</td>\n",
       "      <td>-0.051109</td>\n",
       "      <td>0.044799</td>\n",
       "      <td>0.093681</td>\n",
       "      <td>0.117761</td>\n",
       "      <td>-0.113826</td>\n",
       "      <td>...</td>\n",
       "      <td>0.042109</td>\n",
       "      <td>-0.098792</td>\n",
       "      <td>0.247691</td>\n",
       "      <td>-0.036003</td>\n",
       "      <td>0.248261</td>\n",
       "      <td>0.196554</td>\n",
       "      <td>-0.001640</td>\n",
       "      <td>-0.175060</td>\n",
       "      <td>-0.025497</td>\n",
       "      <td>3ac2ed6c901a8c1f61aaccd5f535610e</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>a06f451ac97e416a3e23dc83d779b450</th>\n",
       "      <td>-0.088922</td>\n",
       "      <td>0.095300</td>\n",
       "      <td>0.044043</td>\n",
       "      <td>-0.281063</td>\n",
       "      <td>-0.086201</td>\n",
       "      <td>-0.081579</td>\n",
       "      <td>0.056584</td>\n",
       "      <td>-0.019975</td>\n",
       "      <td>-0.073882</td>\n",
       "      <td>0.073419</td>\n",
       "      <td>...</td>\n",
       "      <td>0.103352</td>\n",
       "      <td>-0.035890</td>\n",
       "      <td>0.182418</td>\n",
       "      <td>-0.146269</td>\n",
       "      <td>-0.009281</td>\n",
       "      <td>0.258288</td>\n",
       "      <td>-0.056532</td>\n",
       "      <td>0.179102</td>\n",
       "      <td>-0.105327</td>\n",
       "      <td>a06f451ac97e416a3e23dc83d779b450</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>aed3813753bb154c31ba0baa5755fa52</th>\n",
       "      <td>0.265366</td>\n",
       "      <td>0.069603</td>\n",
       "      <td>-0.116056</td>\n",
       "      <td>-0.064341</td>\n",
       "      <td>-0.223334</td>\n",
       "      <td>-0.139857</td>\n",
       "      <td>-0.117373</td>\n",
       "      <td>-0.003364</td>\n",
       "      <td>0.165045</td>\n",
       "      <td>0.138165</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.006653</td>\n",
       "      <td>-0.272331</td>\n",
       "      <td>0.271812</td>\n",
       "      <td>0.065179</td>\n",
       "      <td>0.224623</td>\n",
       "      <td>0.079269</td>\n",
       "      <td>-0.016956</td>\n",
       "      <td>-0.027440</td>\n",
       "      <td>-0.155299</td>\n",
       "      <td>aed3813753bb154c31ba0baa5755fa52</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50999680985c2e4b8e1ca38a23496571</th>\n",
       "      <td>-0.085978</td>\n",
       "      <td>0.072255</td>\n",
       "      <td>-0.038656</td>\n",
       "      <td>-0.039911</td>\n",
       "      <td>-0.197762</td>\n",
       "      <td>0.008704</td>\n",
       "      <td>-0.178474</td>\n",
       "      <td>0.028002</td>\n",
       "      <td>0.031365</td>\n",
       "      <td>-0.051818</td>\n",
       "      <td>...</td>\n",
       "      <td>0.166202</td>\n",
       "      <td>-0.143758</td>\n",
       "      <td>0.170843</td>\n",
       "      <td>-0.087539</td>\n",
       "      <td>0.126389</td>\n",
       "      <td>0.093644</td>\n",
       "      <td>-0.104910</td>\n",
       "      <td>-0.160005</td>\n",
       "      <td>0.017439</td>\n",
       "      <td>50999680985c2e4b8e1ca38a23496571</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>c6716ba9e95a880fa57041ec6f2cb8e3</th>\n",
       "      <td>-0.024419</td>\n",
       "      <td>0.187523</td>\n",
       "      <td>-0.039731</td>\n",
       "      <td>-0.151485</td>\n",
       "      <td>-0.138058</td>\n",
       "      <td>-0.066764</td>\n",
       "      <td>0.025493</td>\n",
       "      <td>0.074366</td>\n",
       "      <td>0.194684</td>\n",
       "      <td>0.019213</td>\n",
       "      <td>...</td>\n",
       "      <td>0.143771</td>\n",
       "      <td>-0.197254</td>\n",
       "      <td>0.049204</td>\n",
       "      <td>-0.064431</td>\n",
       "      <td>-0.091800</td>\n",
       "      <td>0.059095</td>\n",
       "      <td>0.186310</td>\n",
       "      <td>-0.051492</td>\n",
       "      <td>-0.171931</td>\n",
       "      <td>c6716ba9e95a880fa57041ec6f2cb8e3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>730890cd34ac161117a612deee0921ac</th>\n",
       "      <td>0.103679</td>\n",
       "      <td>-0.204068</td>\n",
       "      <td>-0.108106</td>\n",
       "      <td>-0.293693</td>\n",
       "      <td>0.074555</td>\n",
       "      <td>-0.235991</td>\n",
       "      <td>-0.049014</td>\n",
       "      <td>0.029087</td>\n",
       "      <td>0.175783</td>\n",
       "      <td>-0.071693</td>\n",
       "      <td>...</td>\n",
       "      <td>0.117723</td>\n",
       "      <td>-0.134120</td>\n",
       "      <td>0.142837</td>\n",
       "      <td>0.074518</td>\n",
       "      <td>0.117027</td>\n",
       "      <td>-0.001979</td>\n",
       "      <td>0.049145</td>\n",
       "      <td>0.240171</td>\n",
       "      <td>-0.204190</td>\n",
       "      <td>730890cd34ac161117a612deee0921ac</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57e5e7fcd9287ddaf5195e6e0230e01a</th>\n",
       "      <td>0.047227</td>\n",
       "      <td>0.046551</td>\n",
       "      <td>-0.085019</td>\n",
       "      <td>-0.325037</td>\n",
       "      <td>-0.049108</td>\n",
       "      <td>0.103695</td>\n",
       "      <td>0.029109</td>\n",
       "      <td>-0.009595</td>\n",
       "      <td>0.276551</td>\n",
       "      <td>-0.176007</td>\n",
       "      <td>...</td>\n",
       "      <td>0.233791</td>\n",
       "      <td>-0.082447</td>\n",
       "      <td>0.042312</td>\n",
       "      <td>-0.025665</td>\n",
       "      <td>0.288510</td>\n",
       "      <td>0.231893</td>\n",
       "      <td>-0.047362</td>\n",
       "      <td>0.218689</td>\n",
       "      <td>-0.194203</td>\n",
       "      <td>57e5e7fcd9287ddaf5195e6e0230e01a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>96127de9846eb55b3fe4ccedb23ded4b</th>\n",
       "      <td>0.178825</td>\n",
       "      <td>0.092693</td>\n",
       "      <td>-0.147028</td>\n",
       "      <td>-0.178634</td>\n",
       "      <td>-0.116189</td>\n",
       "      <td>0.010839</td>\n",
       "      <td>-0.285221</td>\n",
       "      <td>-0.074372</td>\n",
       "      <td>0.054715</td>\n",
       "      <td>-0.161686</td>\n",
       "      <td>...</td>\n",
       "      <td>0.204444</td>\n",
       "      <td>-0.202406</td>\n",
       "      <td>-0.010553</td>\n",
       "      <td>-0.177014</td>\n",
       "      <td>0.221002</td>\n",
       "      <td>0.279636</td>\n",
       "      <td>-0.006578</td>\n",
       "      <td>0.110608</td>\n",
       "      <td>-0.174116</td>\n",
       "      <td>96127de9846eb55b3fe4ccedb23ded4b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>eda99244ee59adc030fa832298eb3fc2</th>\n",
       "      <td>-0.014867</td>\n",
       "      <td>0.294650</td>\n",
       "      <td>-0.030907</td>\n",
       "      <td>-0.344912</td>\n",
       "      <td>-0.181097</td>\n",
       "      <td>-0.017894</td>\n",
       "      <td>0.033809</td>\n",
       "      <td>-0.104830</td>\n",
       "      <td>-0.078909</td>\n",
       "      <td>-0.224345</td>\n",
       "      <td>...</td>\n",
       "      <td>0.160177</td>\n",
       "      <td>-0.073975</td>\n",
       "      <td>0.349831</td>\n",
       "      <td>0.095981</td>\n",
       "      <td>0.107860</td>\n",
       "      <td>0.216804</td>\n",
       "      <td>0.081045</td>\n",
       "      <td>-0.092526</td>\n",
       "      <td>-0.149761</td>\n",
       "      <td>eda99244ee59adc030fa832298eb3fc2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>12325 rows × 73 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                         0         1         2         3  \\\n",
       "4e6091b8553e6fb25c867cdc7d5608d9  0.224221  0.163928 -0.096474 -0.313437   \n",
       "3ac2ed6c901a8c1f61aaccd5f535610e -0.046063 -0.060179  0.122003 -0.290268   \n",
       "a06f451ac97e416a3e23dc83d779b450 -0.088922  0.095300  0.044043 -0.281063   \n",
       "aed3813753bb154c31ba0baa5755fa52  0.265366  0.069603 -0.116056 -0.064341   \n",
       "50999680985c2e4b8e1ca38a23496571 -0.085978  0.072255 -0.038656 -0.039911   \n",
       "...                                    ...       ...       ...       ...   \n",
       "c6716ba9e95a880fa57041ec6f2cb8e3 -0.024419  0.187523 -0.039731 -0.151485   \n",
       "730890cd34ac161117a612deee0921ac  0.103679 -0.204068 -0.108106 -0.293693   \n",
       "57e5e7fcd9287ddaf5195e6e0230e01a  0.047227  0.046551 -0.085019 -0.325037   \n",
       "96127de9846eb55b3fe4ccedb23ded4b  0.178825  0.092693 -0.147028 -0.178634   \n",
       "eda99244ee59adc030fa832298eb3fc2 -0.014867  0.294650 -0.030907 -0.344912   \n",
       "\n",
       "                                         4         5         6         7  \\\n",
       "4e6091b8553e6fb25c867cdc7d5608d9 -0.032228  0.073984 -0.036641 -0.074896   \n",
       "3ac2ed6c901a8c1f61aaccd5f535610e -0.051804 -0.051109  0.044799  0.093681   \n",
       "a06f451ac97e416a3e23dc83d779b450 -0.086201 -0.081579  0.056584 -0.019975   \n",
       "aed3813753bb154c31ba0baa5755fa52 -0.223334 -0.139857 -0.117373 -0.003364   \n",
       "50999680985c2e4b8e1ca38a23496571 -0.197762  0.008704 -0.178474  0.028002   \n",
       "...                                    ...       ...       ...       ...   \n",
       "c6716ba9e95a880fa57041ec6f2cb8e3 -0.138058 -0.066764  0.025493  0.074366   \n",
       "730890cd34ac161117a612deee0921ac  0.074555 -0.235991 -0.049014  0.029087   \n",
       "57e5e7fcd9287ddaf5195e6e0230e01a -0.049108  0.103695  0.029109 -0.009595   \n",
       "96127de9846eb55b3fe4ccedb23ded4b -0.116189  0.010839 -0.285221 -0.074372   \n",
       "eda99244ee59adc030fa832298eb3fc2 -0.181097 -0.017894  0.033809 -0.104830   \n",
       "\n",
       "                                         8         9  ...        63        64  \\\n",
       "4e6091b8553e6fb25c867cdc7d5608d9  0.079085 -0.028804  ...  0.254407 -0.150296   \n",
       "3ac2ed6c901a8c1f61aaccd5f535610e  0.117761 -0.113826  ...  0.042109 -0.098792   \n",
       "a06f451ac97e416a3e23dc83d779b450 -0.073882  0.073419  ...  0.103352 -0.035890   \n",
       "aed3813753bb154c31ba0baa5755fa52  0.165045  0.138165  ... -0.006653 -0.272331   \n",
       "50999680985c2e4b8e1ca38a23496571  0.031365 -0.051818  ...  0.166202 -0.143758   \n",
       "...                                    ...       ...  ...       ...       ...   \n",
       "c6716ba9e95a880fa57041ec6f2cb8e3  0.194684  0.019213  ...  0.143771 -0.197254   \n",
       "730890cd34ac161117a612deee0921ac  0.175783 -0.071693  ...  0.117723 -0.134120   \n",
       "57e5e7fcd9287ddaf5195e6e0230e01a  0.276551 -0.176007  ...  0.233791 -0.082447   \n",
       "96127de9846eb55b3fe4ccedb23ded4b  0.054715 -0.161686  ...  0.204444 -0.202406   \n",
       "eda99244ee59adc030fa832298eb3fc2 -0.078909 -0.224345  ...  0.160177 -0.073975   \n",
       "\n",
       "                                        65        66        67        68  \\\n",
       "4e6091b8553e6fb25c867cdc7d5608d9  0.124841  0.016994  0.197462  0.256780   \n",
       "3ac2ed6c901a8c1f61aaccd5f535610e  0.247691 -0.036003  0.248261  0.196554   \n",
       "a06f451ac97e416a3e23dc83d779b450  0.182418 -0.146269 -0.009281  0.258288   \n",
       "aed3813753bb154c31ba0baa5755fa52  0.271812  0.065179  0.224623  0.079269   \n",
       "50999680985c2e4b8e1ca38a23496571  0.170843 -0.087539  0.126389  0.093644   \n",
       "...                                    ...       ...       ...       ...   \n",
       "c6716ba9e95a880fa57041ec6f2cb8e3  0.049204 -0.064431 -0.091800  0.059095   \n",
       "730890cd34ac161117a612deee0921ac  0.142837  0.074518  0.117027 -0.001979   \n",
       "57e5e7fcd9287ddaf5195e6e0230e01a  0.042312 -0.025665  0.288510  0.231893   \n",
       "96127de9846eb55b3fe4ccedb23ded4b -0.010553 -0.177014  0.221002  0.279636   \n",
       "eda99244ee59adc030fa832298eb3fc2  0.349831  0.095981  0.107860  0.216804   \n",
       "\n",
       "                                        69        70        71  \\\n",
       "4e6091b8553e6fb25c867cdc7d5608d9 -0.019747  0.063751 -0.162092   \n",
       "3ac2ed6c901a8c1f61aaccd5f535610e -0.001640 -0.175060 -0.025497   \n",
       "a06f451ac97e416a3e23dc83d779b450 -0.056532  0.179102 -0.105327   \n",
       "aed3813753bb154c31ba0baa5755fa52 -0.016956 -0.027440 -0.155299   \n",
       "50999680985c2e4b8e1ca38a23496571 -0.104910 -0.160005  0.017439   \n",
       "...                                    ...       ...       ...   \n",
       "c6716ba9e95a880fa57041ec6f2cb8e3  0.186310 -0.051492 -0.171931   \n",
       "730890cd34ac161117a612deee0921ac  0.049145  0.240171 -0.204190   \n",
       "57e5e7fcd9287ddaf5195e6e0230e01a -0.047362  0.218689 -0.194203   \n",
       "96127de9846eb55b3fe4ccedb23ded4b -0.006578  0.110608 -0.174116   \n",
       "eda99244ee59adc030fa832298eb3fc2  0.081045 -0.092526 -0.149761   \n",
       "\n",
       "                                                          vid_next  \n",
       "4e6091b8553e6fb25c867cdc7d5608d9  4e6091b8553e6fb25c867cdc7d5608d9  \n",
       "3ac2ed6c901a8c1f61aaccd5f535610e  3ac2ed6c901a8c1f61aaccd5f535610e  \n",
       "a06f451ac97e416a3e23dc83d779b450  a06f451ac97e416a3e23dc83d779b450  \n",
       "aed3813753bb154c31ba0baa5755fa52  aed3813753bb154c31ba0baa5755fa52  \n",
       "50999680985c2e4b8e1ca38a23496571  50999680985c2e4b8e1ca38a23496571  \n",
       "...                                                            ...  \n",
       "c6716ba9e95a880fa57041ec6f2cb8e3  c6716ba9e95a880fa57041ec6f2cb8e3  \n",
       "730890cd34ac161117a612deee0921ac  730890cd34ac161117a612deee0921ac  \n",
       "57e5e7fcd9287ddaf5195e6e0230e01a  57e5e7fcd9287ddaf5195e6e0230e01a  \n",
       "96127de9846eb55b3fe4ccedb23ded4b  96127de9846eb55b3fe4ccedb23ded4b  \n",
       "eda99244ee59adc030fa832298eb3fc2  eda99244ee59adc030fa832298eb3fc2  \n",
       "\n",
       "[12325 rows x 73 columns]"
      ]
     },
     "execution_count": 174,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect vid_bert_em_pd_fin (stored output: 12325 rows x 73 columns, i.e. numeric columns 0..71 plus 'vid_next').\n",
    "vid_bert_em_pd_fin"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 175,
   "id": "20f314f0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>did</th>\n",
       "      <th>vid</th>\n",
       "      <th>vts</th>\n",
       "      <th>hb</th>\n",
       "      <th>seq_no</th>\n",
       "      <th>cpn</th>\n",
       "      <th>fpn</th>\n",
       "      <th>time_gap</th>\n",
       "      <th>rank</th>\n",
       "      <th>vid_next</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2696556</td>\n",
       "      <td>0000d0aabe8c188f88c756ce0f7f9639</td>\n",
       "      <td>449a1829a8742e652cb39d6ae7523df1</td>\n",
       "      <td>1620.0</td>\n",
       "      <td>1983.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1</td>\n",
       "      <td>130</td>\n",
       "      <td>3103.0</td>\n",
       "      <td>3</td>\n",
       "      <td>815a870db0d94166e330ae97f7068de2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2696557</td>\n",
       "      <td>0000d0aabe8c188f88c756ce0f7f9639</td>\n",
       "      <td>bee6becd984e6486215f7acd876b5d26</td>\n",
       "      <td>2714.0</td>\n",
       "      <td>2611.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1</td>\n",
       "      <td>68</td>\n",
       "      <td>546847.0</td>\n",
       "      <td>4</td>\n",
       "      <td>449a1829a8742e652cb39d6ae7523df1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2696558</td>\n",
       "      <td>0000d0aabe8c188f88c756ce0f7f9639</td>\n",
       "      <td>2c47a9311f2f19d6b33670715cdc544d</td>\n",
       "      <td>780.0</td>\n",
       "      <td>7715.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1</td>\n",
       "      <td>130</td>\n",
       "      <td>57645.0</td>\n",
       "      <td>5</td>\n",
       "      <td>bee6becd984e6486215f7acd876b5d26</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0                               did  \\\n",
       "2     2696556  0000d0aabe8c188f88c756ce0f7f9639   \n",
       "3     2696557  0000d0aabe8c188f88c756ce0f7f9639   \n",
       "4     2696558  0000d0aabe8c188f88c756ce0f7f9639   \n",
       "\n",
       "                                vid     vts      hb  seq_no  cpn  fpn  \\\n",
       "2  449a1829a8742e652cb39d6ae7523df1  1620.0  1983.0     5.0    1  130   \n",
       "3  bee6becd984e6486215f7acd876b5d26  2714.0  2611.0     4.0    1   68   \n",
       "4  2c47a9311f2f19d6b33670715cdc544d   780.0  7715.0     3.0    1  130   \n",
       "\n",
       "   time_gap  rank                          vid_next  \n",
       "2    3103.0     3  815a870db0d94166e330ae97f7068de2  \n",
       "3  546847.0     4  449a1829a8742e652cb39d6ae7523df1  \n",
       "4   57645.0     5  bee6becd984e6486215f7acd876b5d26  "
      ]
     },
     "execution_count": 175,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Peek at the first three rows of the training interaction log.\n",
    "train_origin_data.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 176,
   "id": "7dff85b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def recall_video_by_hot_a_to_b(data, candidate_items, vid_info, rank=1, rank_num=30):\n",
    "    \n",
    "    tmp =data[['vid', 'vid_next']]\n",
    "    #增加热点视频，召回时考虑\n",
    "    hot_list = data[data['rank']==rank].vid.value_counts().head(6).index\n",
    "    top_hot_list = pd.DataFrame(hot_list, columns=['vid_next'])\n",
    "    top_hot_list['key'] = 1\n",
    "    \n",
    "    hot_tmp = data[['vid']]\n",
    "    hot_tmp = hot_tmp.drop_duplicates(keep='first')\n",
    "    hot_tmp['key']=1\n",
    "    hot_tmp = hot_tmp.merge(top_hot_list, on='key', how='inner')\n",
    "    hot_tmp = hot_tmp[['vid', 'vid_next']]\n",
    "    \n",
    "    tmp = pd.concat([tmp, hot_tmp])\n",
    "    \n",
    "    \n",
    "    tmp = tmp.groupby(['vid','vid_next']).size().sort_values(ascending=False).reset_index() #统计数目\n",
    "    tmp = tmp[tmp.vid_next.isin(candidate_items.vid.unique())]                    #筛选在预测集中的\n",
    "    tmp = tmp.groupby('vid').head(rank_num)                                   # 保留最多的hot vid-->n_vid 链\n",
    "    tmp = tmp.rename(columns={'vid_next':'recall_vid_next'})                  # did, vid, \n",
    "    base =  data[data['rank']==rank][['did','vid','vid_next']] \n",
    "    base = base.merge(tmp, on='vid', how='left') # did, vid, vid_next, recall_vid_next, 0\n",
    "    base['label'] = 0\n",
    "\n",
    "    \n",
    "    base.loc[base['vid_next']==base['recall_vid_next'],['label']] = 1\n",
    "    base = base[['did', 'vid', 'recall_vid_next', 'label']].rename(columns={'recall_vid_next':'vid_next'})\n",
    "    base = base.drop_duplicates(keep='first')\n",
    "    \n",
    "    return base"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 177,
   "id": "f7bf1635",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training candidates are recalled for the rows at rank 3 (up to 5 per vid),\n",
    "# test candidates for the rows at rank 1 (up to 30 per vid).\n",
    "train_d = recall_video_by_hot_a_to_b(\n",
    "    data=train_origin_data, candidate_items=candidate_items, vid_info=vid_info, rank=3, rank_num=5)\n",
    "test_d = recall_video_by_hot_a_to_b(\n",
    "    data=test_origin_data, candidate_items=candidate_items, vid_info=vid_info, rank=1, rank_num=30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 178,
   "id": "1f939734",
   "metadata": {},
   "outputs": [],
   "source": [
    "from utils import *\n",
    "\n",
    "# Build the feature table for the recalled candidates.\n",
    "def make_feature(data, vid_info, origin_data, rank=3):\n",
    "    \"\"\"Attach video metadata and click-based features to recalled candidates.\n",
    "\n",
    "    `data` (did, vid, vid_next, label) is copied, joined with `vid_info` for\n",
    "    both the current video ('vid') and the candidate ('vid_next', columns\n",
    "    prefixed with 'fvid_'), passed through the feature helpers star-imported\n",
    "    from `utils`, and returned with every missing value replaced by 0.\n",
    "    \"\"\"\n",
    "    # Metadata of the current video, joined on 'vid'.\n",
    "    features = data.copy().merge(vid_info, on='vid', how='left')\n",
    "\n",
    "    # Metadata of the candidate video, renamed so both sides can be compared (cid etc.).\n",
    "    candidate_meta = vid_info[['vid', 'cid', 'serialno', 'duration']].rename(\n",
    "        columns={\n",
    "            'vid': 'vid_next',\n",
    "            'cid': 'fvid_cid',\n",
    "            'serialno': 'fvid_serialno',\n",
    "            'duration': 'fvid_duration',\n",
    "        }\n",
    "    )\n",
    "    features = pd.merge(features, candidate_meta, on='vid_next', how='left')\n",
    "\n",
    "    # Click log restricted to rows with rank >= `rank`, with each vid's cid attached.\n",
    "    click_log = origin_data[origin_data['rank'] >= rank]\n",
    "    click_log = pd.merge(click_log, vid_info[['vid', 'cid']], on='vid', how='left')\n",
    "\n",
    "    # Per the original notes: fvid_vid_same_cid flags whether both videos share a cid.\n",
    "    features = get_fvid_vid_same_cid(features, vid_info)\n",
    "    # Per the original notes: update_vid flags whether the watched cid is the later cid.\n",
    "    features = get_update_vid(features, vid_info)\n",
    "    # Per the original notes: update_vid_abs flags whether the video lies within n episodes.\n",
    "    # The helper presumably adds a column named 'update_vid_abs' (verify in utils); it is\n",
    "    # renamed after each call so the results for the different n do not collide.\n",
    "    for window in (1, 2, 3, 5):\n",
    "        features = get_update_vid_abs(features, vid_info, window)\n",
    "        features = features.rename(columns={'update_vid_abs': 'update_vid_abs_' + str(window)})\n",
    "\n",
    "    # time_gap feature\n",
    "    features = get_vid_last_click_time_diff(features, origin_data, rank)\n",
    "\n",
    "    for key in ('vid_next', 'vid', 'cid'):\n",
    "        features = feature_click_unique(features, click_log, key, 'did')\n",
    "\n",
    "    # The cross features (feature_cross_count / feature_cross_sum / feature_cross_mean over\n",
    "    # ['vid_next', i] and ['did', i]) were commented out in the original and stay disabled.\n",
    "    return features.fillna(0)\n",
    "\n",
    "# NOTE(review): train candidates were recalled with rank=3 but train features are\n",
    "# built with rank=2, while test uses rank=1 for both - confirm the offset is intended.\n",
    "train_features = make_feature(data=train_d, vid_info=vid_info, origin_data=train_origin_data, rank=2)\n",
    "test_features = make_feature(data=test_d, vid_info=vid_info, origin_data=test_origin_data, rank=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 179,
   "id": "7b1aa636",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Join the columns of vid_bert_em_pd_fin onto every candidate through 'vid_next'.\n",
    "# NOTE(review): only train_features is merged in this cell - confirm test_features\n",
    "# receives the same columns before prediction.\n",
    "train_features = pd.merge(train_features, vid_bert_em_pd_fin, how='left', on='vid_next')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 180,
   "id": "91b089e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Positional hold-out: the first 800000 candidate rows train, the remainder validates.\n",
    "# NOTE(review): the cut is a hard-coded row position - confirm it does not split the\n",
    "# candidate rows of a single did between the two sets.\n",
    "train_data, valid_data = train_features.iloc[:800000], train_features.iloc[800000:]\n",
    "test_data = test_features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 181,
   "id": "aa4781e5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>did</th>\n",
       "      <th>vid</th>\n",
       "      <th>vid_next</th>\n",
       "      <th>label</th>\n",
       "      <th>cid</th>\n",
       "      <th>is_intact</th>\n",
       "      <th>serialno</th>\n",
       "      <th>classify_id</th>\n",
       "      <th>series_id</th>\n",
       "      <th>duration</th>\n",
       "      <th>...</th>\n",
       "      <th>62</th>\n",
       "      <th>63</th>\n",
       "      <th>64</th>\n",
       "      <th>65</th>\n",
       "      <th>66</th>\n",
       "      <th>67</th>\n",
       "      <th>68</th>\n",
       "      <th>69</th>\n",
       "      <th>70</th>\n",
       "      <th>71</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>800000</th>\n",
       "      <td>efc919c0505fffdc63c6a3081f6f6226</td>\n",
       "      <td>6dca3fdb207e0587150d1fd0d3860307</td>\n",
       "      <td>31e2a3c49a123ae3acb130828881ad76</td>\n",
       "      <td>0</td>\n",
       "      <td>04dedb2a4e44c26542676ad1fc74d9ef</td>\n",
       "      <td>3</td>\n",
       "      <td>22</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>81</td>\n",
       "      <td>...</td>\n",
       "      <td>0.089897</td>\n",
       "      <td>0.066589</td>\n",
       "      <td>0.042069</td>\n",
       "      <td>0.002364</td>\n",
       "      <td>-0.023163</td>\n",
       "      <td>0.000433</td>\n",
       "      <td>0.151972</td>\n",
       "      <td>-0.030058</td>\n",
       "      <td>0.221805</td>\n",
       "      <td>-0.145798</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>800001</th>\n",
       "      <td>efc919c0505fffdc63c6a3081f6f6226</td>\n",
       "      <td>6dca3fdb207e0587150d1fd0d3860307</td>\n",
       "      <td>388d453f41a9275830fd47a5a9bc1023</td>\n",
       "      <td>1</td>\n",
       "      <td>04dedb2a4e44c26542676ad1fc74d9ef</td>\n",
       "      <td>3</td>\n",
       "      <td>22</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>81</td>\n",
       "      <td>...</td>\n",
       "      <td>0.027210</td>\n",
       "      <td>0.176136</td>\n",
       "      <td>-0.148914</td>\n",
       "      <td>0.321954</td>\n",
       "      <td>-0.108836</td>\n",
       "      <td>0.274061</td>\n",
       "      <td>0.280797</td>\n",
       "      <td>-0.091574</td>\n",
       "      <td>0.107974</td>\n",
       "      <td>0.158113</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>800002</th>\n",
       "      <td>efc919c0505fffdc63c6a3081f6f6226</td>\n",
       "      <td>6dca3fdb207e0587150d1fd0d3860307</td>\n",
       "      <td>0cc56b6914c87d906b23e6d90cd4f65e</td>\n",
       "      <td>0</td>\n",
       "      <td>04dedb2a4e44c26542676ad1fc74d9ef</td>\n",
       "      <td>3</td>\n",
       "      <td>22</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>81</td>\n",
       "      <td>...</td>\n",
       "      <td>0.003277</td>\n",
       "      <td>0.081982</td>\n",
       "      <td>-0.049158</td>\n",
       "      <td>0.145481</td>\n",
       "      <td>0.019086</td>\n",
       "      <td>0.129693</td>\n",
       "      <td>0.025949</td>\n",
       "      <td>-0.244428</td>\n",
       "      <td>0.291938</td>\n",
       "      <td>0.099707</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>800003</th>\n",
       "      <td>efc919c0505fffdc63c6a3081f6f6226</td>\n",
       "      <td>6dca3fdb207e0587150d1fd0d3860307</td>\n",
       "      <td>f2cc06a8d6fa7513a7597e02808e5c7e</td>\n",
       "      <td>0</td>\n",
       "      <td>04dedb2a4e44c26542676ad1fc74d9ef</td>\n",
       "      <td>3</td>\n",
       "      <td>22</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>81</td>\n",
       "      <td>...</td>\n",
       "      <td>0.232113</td>\n",
       "      <td>0.259245</td>\n",
       "      <td>0.015874</td>\n",
       "      <td>0.128125</td>\n",
       "      <td>0.272246</td>\n",
       "      <td>0.147186</td>\n",
       "      <td>-0.018089</td>\n",
       "      <td>0.122752</td>\n",
       "      <td>0.378198</td>\n",
       "      <td>-0.145448</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>800004</th>\n",
       "      <td>efc919c0505fffdc63c6a3081f6f6226</td>\n",
       "      <td>6dca3fdb207e0587150d1fd0d3860307</td>\n",
       "      <td>3be18c575207dd28d57ba7adc99a3b0a</td>\n",
       "      <td>0</td>\n",
       "      <td>04dedb2a4e44c26542676ad1fc74d9ef</td>\n",
       "      <td>3</td>\n",
       "      <td>22</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>81</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.001586</td>\n",
       "      <td>0.091517</td>\n",
       "      <td>-0.086195</td>\n",
       "      <td>0.093147</td>\n",
       "      <td>0.026626</td>\n",
       "      <td>0.288764</td>\n",
       "      <td>-0.024857</td>\n",
       "      <td>-0.125218</td>\n",
       "      <td>0.274347</td>\n",
       "      <td>-0.240868</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>800005</th>\n",
       "      <td>efc9d9350ad48b55aaf2f0e9bf40f851</td>\n",
       "      <td>dd4ad1276e13603b6a15704295ba6ec8</td>\n",
       "      <td>7d697b54355da49d2a6949e9969bf828</td>\n",
       "      <td>0</td>\n",
       "      <td>f703a7e171878113d1854d9e25b1df7f</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>97664</td>\n",
       "      <td>6170</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.185058</td>\n",
       "      <td>0.054995</td>\n",
       "      <td>-0.143772</td>\n",
       "      <td>0.198850</td>\n",
       "      <td>-0.189167</td>\n",
       "      <td>0.191430</td>\n",
       "      <td>0.276149</td>\n",
       "      <td>0.028904</td>\n",
       "      <td>-0.043255</td>\n",
       "      <td>-0.009367</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>800006</th>\n",
       "      <td>efc9d9350ad48b55aaf2f0e9bf40f851</td>\n",
       "      <td>dd4ad1276e13603b6a15704295ba6ec8</td>\n",
       "      <td>0cc56b6914c87d906b23e6d90cd4f65e</td>\n",
       "      <td>1</td>\n",
       "      <td>f703a7e171878113d1854d9e25b1df7f</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>97664</td>\n",
       "      <td>6170</td>\n",
       "      <td>...</td>\n",
       "      <td>0.003277</td>\n",
       "      <td>0.081982</td>\n",
       "      <td>-0.049158</td>\n",
       "      <td>0.145481</td>\n",
       "      <td>0.019086</td>\n",
       "      <td>0.129693</td>\n",
       "      <td>0.025949</td>\n",
       "      <td>-0.244428</td>\n",
       "      <td>0.291938</td>\n",
       "      <td>0.099707</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>800007</th>\n",
       "      <td>efc9d9350ad48b55aaf2f0e9bf40f851</td>\n",
       "      <td>dd4ad1276e13603b6a15704295ba6ec8</td>\n",
       "      <td>8d0a739e77f24c5231c40f19c8c36dee</td>\n",
       "      <td>0</td>\n",
       "      <td>f703a7e171878113d1854d9e25b1df7f</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>97664</td>\n",
       "      <td>6170</td>\n",
       "      <td>...</td>\n",
       "      <td>0.088733</td>\n",
       "      <td>0.111608</td>\n",
       "      <td>-0.010785</td>\n",
       "      <td>0.379843</td>\n",
       "      <td>-0.161900</td>\n",
       "      <td>0.209395</td>\n",
       "      <td>-0.054082</td>\n",
       "      <td>-0.211781</td>\n",
       "      <td>-0.084917</td>\n",
       "      <td>0.101512</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>800008</th>\n",
       "      <td>efc9d9350ad48b55aaf2f0e9bf40f851</td>\n",
       "      <td>dd4ad1276e13603b6a15704295ba6ec8</td>\n",
       "      <td>114badee091c5d3faab0f620e608d6c0</td>\n",
       "      <td>0</td>\n",
       "      <td>f703a7e171878113d1854d9e25b1df7f</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>97664</td>\n",
       "      <td>6170</td>\n",
       "      <td>...</td>\n",
       "      <td>0.112283</td>\n",
       "      <td>0.148375</td>\n",
       "      <td>-0.156462</td>\n",
       "      <td>0.330316</td>\n",
       "      <td>-0.129281</td>\n",
       "      <td>0.022817</td>\n",
       "      <td>-0.043506</td>\n",
       "      <td>-0.016560</td>\n",
       "      <td>0.013073</td>\n",
       "      <td>-0.156181</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>800009</th>\n",
       "      <td>efc9d9350ad48b55aaf2f0e9bf40f851</td>\n",
       "      <td>dd4ad1276e13603b6a15704295ba6ec8</td>\n",
       "      <td>1550a3571af7e8d70a8306af72fe664e</td>\n",
       "      <td>0</td>\n",
       "      <td>f703a7e171878113d1854d9e25b1df7f</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>97664</td>\n",
       "      <td>6170</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.039863</td>\n",
       "      <td>0.133336</td>\n",
       "      <td>-0.195368</td>\n",
       "      <td>0.257092</td>\n",
       "      <td>-0.048936</td>\n",
       "      <td>0.235421</td>\n",
       "      <td>0.224185</td>\n",
       "      <td>-0.070500</td>\n",
       "      <td>0.347438</td>\n",
       "      <td>-0.006637</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>800010</th>\n",
       "      <td>efca04f59aa7f97409765596104e07c5</td>\n",
       "      <td>83676d17979d0f7a2dba47874ea35843</td>\n",
       "      <td>0e45009e5d5b99a9b925a6876ceb15e7</td>\n",
       "      <td>0</td>\n",
       "      <td>644de3dce40c823149df609c0dde5b6d</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2321</td>\n",
       "      <td>...</td>\n",
       "      <td>0.105003</td>\n",
       "      <td>-0.020891</td>\n",
       "      <td>-0.320090</td>\n",
       "      <td>-0.049073</td>\n",
       "      <td>-0.019815</td>\n",
       "      <td>0.177344</td>\n",
       "      <td>-0.056264</td>\n",
       "      <td>-0.160038</td>\n",
       "      <td>0.064474</td>\n",
       "      <td>-0.200339</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>800011</th>\n",
       "      <td>efca04f59aa7f97409765596104e07c5</td>\n",
       "      <td>83676d17979d0f7a2dba47874ea35843</td>\n",
       "      <td>50999680985c2e4b8e1ca38a23496571</td>\n",
       "      <td>0</td>\n",
       "      <td>644de3dce40c823149df609c0dde5b6d</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2321</td>\n",
       "      <td>...</td>\n",
       "      <td>0.016955</td>\n",
       "      <td>0.166202</td>\n",
       "      <td>-0.143758</td>\n",
       "      <td>0.170843</td>\n",
       "      <td>-0.087539</td>\n",
       "      <td>0.126389</td>\n",
       "      <td>0.093644</td>\n",
       "      <td>-0.104910</td>\n",
       "      <td>-0.160005</td>\n",
       "      <td>0.017439</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>800012</th>\n",
       "      <td>efca04f59aa7f97409765596104e07c5</td>\n",
       "      <td>83676d17979d0f7a2dba47874ea35843</td>\n",
       "      <td>d5659c8731ab402b54651a1ae6ef36aa</td>\n",
       "      <td>0</td>\n",
       "      <td>644de3dce40c823149df609c0dde5b6d</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2321</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.147833</td>\n",
       "      <td>0.037293</td>\n",
       "      <td>-0.154904</td>\n",
       "      <td>0.207073</td>\n",
       "      <td>-0.132432</td>\n",
       "      <td>0.163779</td>\n",
       "      <td>0.124798</td>\n",
       "      <td>-0.087510</td>\n",
       "      <td>0.355808</td>\n",
       "      <td>-0.080707</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>800013</th>\n",
       "      <td>efca04f59aa7f97409765596104e07c5</td>\n",
       "      <td>83676d17979d0f7a2dba47874ea35843</td>\n",
       "      <td>4a87208053004ae71e82c4af599de242</td>\n",
       "      <td>0</td>\n",
       "      <td>644de3dce40c823149df609c0dde5b6d</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2321</td>\n",
       "      <td>...</td>\n",
       "      <td>0.026710</td>\n",
       "      <td>0.173829</td>\n",
       "      <td>-0.207429</td>\n",
       "      <td>0.123646</td>\n",
       "      <td>-0.069800</td>\n",
       "      <td>0.045807</td>\n",
       "      <td>0.184887</td>\n",
       "      <td>0.037751</td>\n",
       "      <td>0.192771</td>\n",
       "      <td>-0.252423</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>800014</th>\n",
       "      <td>efca04f59aa7f97409765596104e07c5</td>\n",
       "      <td>83676d17979d0f7a2dba47874ea35843</td>\n",
       "      <td>0cc56b6914c87d906b23e6d90cd4f65e</td>\n",
       "      <td>0</td>\n",
       "      <td>644de3dce40c823149df609c0dde5b6d</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2321</td>\n",
       "      <td>...</td>\n",
       "      <td>0.003277</td>\n",
       "      <td>0.081982</td>\n",
       "      <td>-0.049158</td>\n",
       "      <td>0.145481</td>\n",
       "      <td>0.019086</td>\n",
       "      <td>0.129693</td>\n",
       "      <td>0.025949</td>\n",
       "      <td>-0.244428</td>\n",
       "      <td>0.291938</td>\n",
       "      <td>0.099707</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>800015</th>\n",
       "      <td>efca3e6360e4933fb329d5e6115492bd</td>\n",
       "      <td>33c64b79ac11b37f451f6b70c94bc0be</td>\n",
       "      <td>50999680985c2e4b8e1ca38a23496571</td>\n",
       "      <td>0</td>\n",
       "      <td>644de3dce40c823149df609c0dde5b6d</td>\n",
       "      <td>3</td>\n",
       "      <td>11</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>59</td>\n",
       "      <td>...</td>\n",
       "      <td>0.016955</td>\n",
       "      <td>0.166202</td>\n",
       "      <td>-0.143758</td>\n",
       "      <td>0.170843</td>\n",
       "      <td>-0.087539</td>\n",
       "      <td>0.126389</td>\n",
       "      <td>0.093644</td>\n",
       "      <td>-0.104910</td>\n",
       "      <td>-0.160005</td>\n",
       "      <td>0.017439</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>800016</th>\n",
       "      <td>efca3e6360e4933fb329d5e6115492bd</td>\n",
       "      <td>33c64b79ac11b37f451f6b70c94bc0be</td>\n",
       "      <td>a06f451ac97e416a3e23dc83d779b450</td>\n",
       "      <td>0</td>\n",
       "      <td>644de3dce40c823149df609c0dde5b6d</td>\n",
       "      <td>3</td>\n",
       "      <td>11</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>59</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.002639</td>\n",
       "      <td>0.103352</td>\n",
       "      <td>-0.035890</td>\n",
       "      <td>0.182418</td>\n",
       "      <td>-0.146269</td>\n",
       "      <td>-0.009281</td>\n",
       "      <td>0.258288</td>\n",
       "      <td>-0.056532</td>\n",
       "      <td>0.179102</td>\n",
       "      <td>-0.105327</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>800017</th>\n",
       "      <td>efca3e6360e4933fb329d5e6115492bd</td>\n",
       "      <td>33c64b79ac11b37f451f6b70c94bc0be</td>\n",
       "      <td>0efd54bde984591c3d18a3171c46885b</td>\n",
       "      <td>0</td>\n",
       "      <td>644de3dce40c823149df609c0dde5b6d</td>\n",
       "      <td>3</td>\n",
       "      <td>11</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>59</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.261602</td>\n",
       "      <td>0.202715</td>\n",
       "      <td>-0.179653</td>\n",
       "      <td>-0.115316</td>\n",
       "      <td>-0.092948</td>\n",
       "      <td>0.282867</td>\n",
       "      <td>0.218535</td>\n",
       "      <td>0.278222</td>\n",
       "      <td>0.114994</td>\n",
       "      <td>-0.165356</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>800018</th>\n",
       "      <td>efca3e6360e4933fb329d5e6115492bd</td>\n",
       "      <td>33c64b79ac11b37f451f6b70c94bc0be</td>\n",
       "      <td>ca059df494c97c399d5a235196713d04</td>\n",
       "      <td>0</td>\n",
       "      <td>644de3dce40c823149df609c0dde5b6d</td>\n",
       "      <td>3</td>\n",
       "      <td>11</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>59</td>\n",
       "      <td>...</td>\n",
       "      <td>0.143645</td>\n",
       "      <td>0.154503</td>\n",
       "      <td>-0.077997</td>\n",
       "      <td>0.240759</td>\n",
       "      <td>0.102560</td>\n",
       "      <td>0.219050</td>\n",
       "      <td>0.015812</td>\n",
       "      <td>-0.076289</td>\n",
       "      <td>0.110664</td>\n",
       "      <td>-0.170317</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>800019</th>\n",
       "      <td>efca3e6360e4933fb329d5e6115492bd</td>\n",
       "      <td>33c64b79ac11b37f451f6b70c94bc0be</td>\n",
       "      <td>dd4ad1276e13603b6a15704295ba6ec8</td>\n",
       "      <td>0</td>\n",
       "      <td>644de3dce40c823149df609c0dde5b6d</td>\n",
       "      <td>3</td>\n",
       "      <td>11</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>59</td>\n",
       "      <td>...</td>\n",
       "      <td>0.155981</td>\n",
       "      <td>0.232841</td>\n",
       "      <td>-0.306172</td>\n",
       "      <td>0.221174</td>\n",
       "      <td>-0.260373</td>\n",
       "      <td>0.028966</td>\n",
       "      <td>0.115172</td>\n",
       "      <td>-0.143052</td>\n",
       "      <td>0.062720</td>\n",
       "      <td>-0.031508</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>20 rows × 100 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                     did                               vid  \\\n",
       "800000  efc919c0505fffdc63c6a3081f6f6226  6dca3fdb207e0587150d1fd0d3860307   \n",
       "800001  efc919c0505fffdc63c6a3081f6f6226  6dca3fdb207e0587150d1fd0d3860307   \n",
       "800002  efc919c0505fffdc63c6a3081f6f6226  6dca3fdb207e0587150d1fd0d3860307   \n",
       "800003  efc919c0505fffdc63c6a3081f6f6226  6dca3fdb207e0587150d1fd0d3860307   \n",
       "800004  efc919c0505fffdc63c6a3081f6f6226  6dca3fdb207e0587150d1fd0d3860307   \n",
       "800005  efc9d9350ad48b55aaf2f0e9bf40f851  dd4ad1276e13603b6a15704295ba6ec8   \n",
       "800006  efc9d9350ad48b55aaf2f0e9bf40f851  dd4ad1276e13603b6a15704295ba6ec8   \n",
       "800007  efc9d9350ad48b55aaf2f0e9bf40f851  dd4ad1276e13603b6a15704295ba6ec8   \n",
       "800008  efc9d9350ad48b55aaf2f0e9bf40f851  dd4ad1276e13603b6a15704295ba6ec8   \n",
       "800009  efc9d9350ad48b55aaf2f0e9bf40f851  dd4ad1276e13603b6a15704295ba6ec8   \n",
       "800010  efca04f59aa7f97409765596104e07c5  83676d17979d0f7a2dba47874ea35843   \n",
       "800011  efca04f59aa7f97409765596104e07c5  83676d17979d0f7a2dba47874ea35843   \n",
       "800012  efca04f59aa7f97409765596104e07c5  83676d17979d0f7a2dba47874ea35843   \n",
       "800013  efca04f59aa7f97409765596104e07c5  83676d17979d0f7a2dba47874ea35843   \n",
       "800014  efca04f59aa7f97409765596104e07c5  83676d17979d0f7a2dba47874ea35843   \n",
       "800015  efca3e6360e4933fb329d5e6115492bd  33c64b79ac11b37f451f6b70c94bc0be   \n",
       "800016  efca3e6360e4933fb329d5e6115492bd  33c64b79ac11b37f451f6b70c94bc0be   \n",
       "800017  efca3e6360e4933fb329d5e6115492bd  33c64b79ac11b37f451f6b70c94bc0be   \n",
       "800018  efca3e6360e4933fb329d5e6115492bd  33c64b79ac11b37f451f6b70c94bc0be   \n",
       "800019  efca3e6360e4933fb329d5e6115492bd  33c64b79ac11b37f451f6b70c94bc0be   \n",
       "\n",
       "                                vid_next  label  \\\n",
       "800000  31e2a3c49a123ae3acb130828881ad76      0   \n",
       "800001  388d453f41a9275830fd47a5a9bc1023      1   \n",
       "800002  0cc56b6914c87d906b23e6d90cd4f65e      0   \n",
       "800003  f2cc06a8d6fa7513a7597e02808e5c7e      0   \n",
       "800004  3be18c575207dd28d57ba7adc99a3b0a      0   \n",
       "800005  7d697b54355da49d2a6949e9969bf828      0   \n",
       "800006  0cc56b6914c87d906b23e6d90cd4f65e      1   \n",
       "800007  8d0a739e77f24c5231c40f19c8c36dee      0   \n",
       "800008  114badee091c5d3faab0f620e608d6c0      0   \n",
       "800009  1550a3571af7e8d70a8306af72fe664e      0   \n",
       "800010  0e45009e5d5b99a9b925a6876ceb15e7      0   \n",
       "800011  50999680985c2e4b8e1ca38a23496571      0   \n",
       "800012  d5659c8731ab402b54651a1ae6ef36aa      0   \n",
       "800013  4a87208053004ae71e82c4af599de242      0   \n",
       "800014  0cc56b6914c87d906b23e6d90cd4f65e      0   \n",
       "800015  50999680985c2e4b8e1ca38a23496571      0   \n",
       "800016  a06f451ac97e416a3e23dc83d779b450      0   \n",
       "800017  0efd54bde984591c3d18a3171c46885b      0   \n",
       "800018  ca059df494c97c399d5a235196713d04      0   \n",
       "800019  dd4ad1276e13603b6a15704295ba6ec8      0   \n",
       "\n",
       "                                     cid  is_intact  serialno  classify_id  \\\n",
       "800000  04dedb2a4e44c26542676ad1fc74d9ef          3        22            2   \n",
       "800001  04dedb2a4e44c26542676ad1fc74d9ef          3        22            2   \n",
       "800002  04dedb2a4e44c26542676ad1fc74d9ef          3        22            2   \n",
       "800003  04dedb2a4e44c26542676ad1fc74d9ef          3        22            2   \n",
       "800004  04dedb2a4e44c26542676ad1fc74d9ef          3        22            2   \n",
       "800005  f703a7e171878113d1854d9e25b1df7f          1         2            1   \n",
       "800006  f703a7e171878113d1854d9e25b1df7f          1         2            1   \n",
       "800007  f703a7e171878113d1854d9e25b1df7f          1         2            1   \n",
       "800008  f703a7e171878113d1854d9e25b1df7f          1         2            1   \n",
       "800009  f703a7e171878113d1854d9e25b1df7f          1         2            1   \n",
       "800010  644de3dce40c823149df609c0dde5b6d          1        13            1   \n",
       "800011  644de3dce40c823149df609c0dde5b6d          1        13            1   \n",
       "800012  644de3dce40c823149df609c0dde5b6d          1        13            1   \n",
       "800013  644de3dce40c823149df609c0dde5b6d          1        13            1   \n",
       "800014  644de3dce40c823149df609c0dde5b6d          1        13            1   \n",
       "800015  644de3dce40c823149df609c0dde5b6d          3        11            1   \n",
       "800016  644de3dce40c823149df609c0dde5b6d          3        11            1   \n",
       "800017  644de3dce40c823149df609c0dde5b6d          3        11            1   \n",
       "800018  644de3dce40c823149df609c0dde5b6d          3        11            1   \n",
       "800019  644de3dce40c823149df609c0dde5b6d          3        11            1   \n",
       "\n",
       "        series_id  duration  ...        62        63        64        65  \\\n",
       "800000          0        81  ...  0.089897  0.066589  0.042069  0.002364   \n",
       "800001          0        81  ...  0.027210  0.176136 -0.148914  0.321954   \n",
       "800002          0        81  ...  0.003277  0.081982 -0.049158  0.145481   \n",
       "800003          0        81  ...  0.232113  0.259245  0.015874  0.128125   \n",
       "800004          0        81  ... -0.001586  0.091517 -0.086195  0.093147   \n",
       "800005      97664      6170  ... -0.185058  0.054995 -0.143772  0.198850   \n",
       "800006      97664      6170  ...  0.003277  0.081982 -0.049158  0.145481   \n",
       "800007      97664      6170  ...  0.088733  0.111608 -0.010785  0.379843   \n",
       "800008      97664      6170  ...  0.112283  0.148375 -0.156462  0.330316   \n",
       "800009      97664      6170  ... -0.039863  0.133336 -0.195368  0.257092   \n",
       "800010          0      2321  ...  0.105003 -0.020891 -0.320090 -0.049073   \n",
       "800011          0      2321  ...  0.016955  0.166202 -0.143758  0.170843   \n",
       "800012          0      2321  ... -0.147833  0.037293 -0.154904  0.207073   \n",
       "800013          0      2321  ...  0.026710  0.173829 -0.207429  0.123646   \n",
       "800014          0      2321  ...  0.003277  0.081982 -0.049158  0.145481   \n",
       "800015          0        59  ...  0.016955  0.166202 -0.143758  0.170843   \n",
       "800016          0        59  ... -0.002639  0.103352 -0.035890  0.182418   \n",
       "800017          0        59  ... -0.261602  0.202715 -0.179653 -0.115316   \n",
       "800018          0        59  ...  0.143645  0.154503 -0.077997  0.240759   \n",
       "800019          0        59  ...  0.155981  0.232841 -0.306172  0.221174   \n",
       "\n",
       "              66        67        68        69        70        71  \n",
       "800000 -0.023163  0.000433  0.151972 -0.030058  0.221805 -0.145798  \n",
       "800001 -0.108836  0.274061  0.280797 -0.091574  0.107974  0.158113  \n",
       "800002  0.019086  0.129693  0.025949 -0.244428  0.291938  0.099707  \n",
       "800003  0.272246  0.147186 -0.018089  0.122752  0.378198 -0.145448  \n",
       "800004  0.026626  0.288764 -0.024857 -0.125218  0.274347 -0.240868  \n",
       "800005 -0.189167  0.191430  0.276149  0.028904 -0.043255 -0.009367  \n",
       "800006  0.019086  0.129693  0.025949 -0.244428  0.291938  0.099707  \n",
       "800007 -0.161900  0.209395 -0.054082 -0.211781 -0.084917  0.101512  \n",
       "800008 -0.129281  0.022817 -0.043506 -0.016560  0.013073 -0.156181  \n",
       "800009 -0.048936  0.235421  0.224185 -0.070500  0.347438 -0.006637  \n",
       "800010 -0.019815  0.177344 -0.056264 -0.160038  0.064474 -0.200339  \n",
       "800011 -0.087539  0.126389  0.093644 -0.104910 -0.160005  0.017439  \n",
       "800012 -0.132432  0.163779  0.124798 -0.087510  0.355808 -0.080707  \n",
       "800013 -0.069800  0.045807  0.184887  0.037751  0.192771 -0.252423  \n",
       "800014  0.019086  0.129693  0.025949 -0.244428  0.291938  0.099707  \n",
       "800015 -0.087539  0.126389  0.093644 -0.104910 -0.160005  0.017439  \n",
       "800016 -0.146269 -0.009281  0.258288 -0.056532  0.179102 -0.105327  \n",
       "800017 -0.092948  0.282867  0.218535  0.278222  0.114994 -0.165356  \n",
       "800018  0.102560  0.219050  0.015812 -0.076289  0.110664 -0.170317  \n",
       "800019 -0.260373  0.028966  0.115172 -0.143052  0.062720 -0.031508  \n",
       "\n",
       "[20 rows x 100 columns]"
      ]
     },
     "execution_count": 181,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "valid_data[:20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 182,
   "id": "b9890d29",
   "metadata": {},
   "outputs": [],
   "source": [
    "useless_cols = ['did', 'vid_next', 'vid', 'label', 'cid', 'upgc_flag', 'preds_rank',\n",
    "                    'vts', 'series_id', 'key_word', 'stars', 'tags','fvid_cid',\n",
    "               'is_intact','serialno','classify_id','duration','title_length','fvid_serialno','fvid_duration']\n",
    "features = train_data.columns[~train_data.columns.isin(useless_cols)].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 183,
   "id": "7fd2083a",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['fvid_vid_same_cid', 'update_vid', 'update_vid_abs_1',\n",
       "       'update_vid_abs_2', 'update_vid_abs_3', 'update_vid_abs_5',\n",
       "       'vid_last_click_time_diff', 'nunique_click_vid_next_did',\n",
       "       'nunique_click_vid_did', 'nunique_click_cid_did', 0, 1, 2, 3, 4, 5,\n",
       "       6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,\n",
       "       24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,\n",
       "       41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,\n",
       "       58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71],\n",
       "      dtype=object)"
      ]
     },
     "execution_count": 183,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 184,
   "id": "a07b442a",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>fvid_vid_same_cid</th>\n",
       "      <th>update_vid</th>\n",
       "      <th>update_vid_abs_1</th>\n",
       "      <th>update_vid_abs_2</th>\n",
       "      <th>update_vid_abs_3</th>\n",
       "      <th>update_vid_abs_5</th>\n",
       "      <th>vid_last_click_time_diff</th>\n",
       "      <th>nunique_click_vid_next_did</th>\n",
       "      <th>nunique_click_vid_did</th>\n",
       "      <th>nunique_click_cid_did</th>\n",
       "      <th>...</th>\n",
       "      <th>62</th>\n",
       "      <th>63</th>\n",
       "      <th>64</th>\n",
       "      <th>65</th>\n",
       "      <th>66</th>\n",
       "      <th>67</th>\n",
       "      <th>68</th>\n",
       "      <th>69</th>\n",
       "      <th>70</th>\n",
       "      <th>71</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>37289</td>\n",
       "      <td>15004</td>\n",
       "      <td>59779</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.148953</td>\n",
       "      <td>0.025655</td>\n",
       "      <td>-0.192339</td>\n",
       "      <td>0.086821</td>\n",
       "      <td>-0.152115</td>\n",
       "      <td>0.200692</td>\n",
       "      <td>0.188146</td>\n",
       "      <td>0.204941</td>\n",
       "      <td>0.185622</td>\n",
       "      <td>-0.000784</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>6139</td>\n",
       "      <td>15004</td>\n",
       "      <td>59779</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.220604</td>\n",
       "      <td>0.080280</td>\n",
       "      <td>-0.259195</td>\n",
       "      <td>0.085699</td>\n",
       "      <td>-0.262640</td>\n",
       "      <td>0.251884</td>\n",
       "      <td>0.239955</td>\n",
       "      <td>0.000670</td>\n",
       "      <td>0.080673</td>\n",
       "      <td>-0.098001</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>14120</td>\n",
       "      <td>15004</td>\n",
       "      <td>59779</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.185909</td>\n",
       "      <td>0.265205</td>\n",
       "      <td>0.111438</td>\n",
       "      <td>0.106111</td>\n",
       "      <td>-0.004808</td>\n",
       "      <td>0.060637</td>\n",
       "      <td>0.297780</td>\n",
       "      <td>0.234480</td>\n",
       "      <td>0.142350</td>\n",
       "      <td>-0.194903</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3 rows × 82 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   fvid_vid_same_cid  update_vid  update_vid_abs_1  update_vid_abs_2  \\\n",
       "0                1.0         1.0               0.0               1.0   \n",
       "1                0.0         0.0               0.0               0.0   \n",
       "2                1.0         1.0               1.0               1.0   \n",
       "\n",
       "   update_vid_abs_3  update_vid_abs_5  vid_last_click_time_diff  \\\n",
       "0               1.0               1.0                       0.0   \n",
       "1               0.0               0.0                       0.0   \n",
       "2               1.0               1.0                       0.0   \n",
       "\n",
       "   nunique_click_vid_next_did  nunique_click_vid_did  nunique_click_cid_did  \\\n",
       "0                       37289                  15004                  59779   \n",
       "1                        6139                  15004                  59779   \n",
       "2                       14120                  15004                  59779   \n",
       "\n",
       "   ...        62        63        64        65        66        67        68  \\\n",
       "0  ... -0.148953  0.025655 -0.192339  0.086821 -0.152115  0.200692  0.188146   \n",
       "1  ... -0.220604  0.080280 -0.259195  0.085699 -0.262640  0.251884  0.239955   \n",
       "2  ... -0.185909  0.265205  0.111438  0.106111 -0.004808  0.060637  0.297780   \n",
       "\n",
       "         69        70        71  \n",
       "0  0.204941  0.185622 -0.000784  \n",
       "1  0.000670  0.080673 -0.098001  \n",
       "2  0.234480  0.142350 -0.194903  \n",
       "\n",
       "[3 rows x 82 columns]"
      ]
     },
     "execution_count": 184,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_data[features][:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 185,
   "id": "1f7f4f90",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(80586, 800000)"
      ]
     },
     "execution_count": 185,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(train_data['label'][train_data['label']==1]),len(train_data['label'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 186,
   "id": "70faa069",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "fvid_vid_same_cid    float64\n",
       "update_vid           float64\n",
       "update_vid_abs_1     float64\n",
       "update_vid_abs_2     float64\n",
       "update_vid_abs_3     float64\n",
       "                      ...   \n",
       "67                   float64\n",
       "68                   float64\n",
       "69                   float64\n",
       "70                   float64\n",
       "71                   float64\n",
       "Length: 82, dtype: object"
      ]
     },
     "execution_count": 186,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_data[features].dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 187,
   "id": "c5fa58ff",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/root/anaconda3/envs/uie_py38/lib/python3.8/site-packages/lightgbm/engine.py:181: UserWarning: 'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.\n",
      "  _log_warning(\"'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. \"\n",
      "/root/anaconda3/envs/uie_py38/lib/python3.8/site-packages/lightgbm/engine.py:239: UserWarning: 'verbose_eval' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.\n",
      "  _log_warning(\"'verbose_eval' argument is deprecated and will be removed in a future release of LightGBM. \"\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[LightGBM] [Warning] min_data_in_leaf is set=15, min_child_samples=15 will be ignored. Current value: min_data_in_leaf=15\n",
      "Training until validation scores don't improve for 100 rounds\n",
      "Early stopping, best iteration is:\n",
      "[208]\ttraining's auc: 0.845735\tvalid_1's auc: 0.820844\n"
     ]
    }
   ],
   "source": [
    "trn_data = lgb.Dataset(\n",
    "        train_data[features], label=train_data['label'].values)\n",
    "val_data = lgb.Dataset(\n",
    "        valid_data[features], label=valid_data['label'].values)\n",
    "params = {\n",
    "        'num_leaves': 64, #31\n",
    "        'min_data_in_leaf': 15, # 30 \n",
    "        'objective':'binary',\n",
    "        'max_depth': -1,\n",
    "        'learning_rate': 0.1,\n",
    "        \"min_child_samples\": 15,\n",
    "        \"boosting\": \"gbdt\",\n",
    "        \"feature_fraction\": 0.8,\n",
    "        \"bagging_freq\": 1,\n",
    "        \"bagging_fraction\": 0.9 ,\n",
    "        \"bagging_seed\": 11,\n",
    "        \"metric\": 'auc',\n",
    "        \"lambda_l1\": 0.1,\n",
    "        \"verbosity\": -1,\n",
    "        \"nthread\": 23,\n",
    "        \"random_state\": 4590,\n",
    "    }\n",
    "clf = lgb.train(params,\n",
    "                    trn_data,\n",
    "                    valid_sets=[trn_data, val_data],\n",
    "                    num_boost_round=10000,\n",
    "                    verbose_eval=500,\n",
    "                    early_stopping_rounds=100)  # , feval=self_gauc)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "id": "e09dbc67",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_data['preds'] = clf.predict(test_data[features], num_iteration=clf.best_iteration)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "id": "458943f3",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>did</th>\n",
       "      <th>vid</th>\n",
       "      <th>vid_next</th>\n",
       "      <th>label</th>\n",
       "      <th>cid</th>\n",
       "      <th>is_intact</th>\n",
       "      <th>serialno</th>\n",
       "      <th>classify_id</th>\n",
       "      <th>series_id</th>\n",
       "      <th>duration</th>\n",
       "      <th>...</th>\n",
       "      <th>update_vid</th>\n",
       "      <th>update_vid_abs_1</th>\n",
       "      <th>update_vid_abs_2</th>\n",
       "      <th>update_vid_abs_3</th>\n",
       "      <th>update_vid_abs_5</th>\n",
       "      <th>vid_last_click_time_diff</th>\n",
       "      <th>nunique_click_vid_next_did</th>\n",
       "      <th>nunique_click_vid_did</th>\n",
       "      <th>nunique_click_cid_did</th>\n",
       "      <th>preds</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0000d0aabe8c188f88c756ce0f7f9639</td>\n",
       "      <td>31c50bb6df1a3087c29f43d9cfda0197</td>\n",
       "      <td>2f8899ad36408f3317612fcbf1e7f2c6</td>\n",
       "      <td>0</td>\n",
       "      <td>2d8b036894dfcc6fd909003ea9c1641c</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>70</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>11636.0</td>\n",
       "      <td>488</td>\n",
       "      <td>3923</td>\n",
       "      <td>18771</td>\n",
       "      <td>0.040259</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0000d0aabe8c188f88c756ce0f7f9639</td>\n",
       "      <td>31c50bb6df1a3087c29f43d9cfda0197</td>\n",
       "      <td>247c0febd952461cb7c7907d30a2c6ef</td>\n",
       "      <td>0</td>\n",
       "      <td>2d8b036894dfcc6fd909003ea9c1641c</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>70</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>11636.0</td>\n",
       "      <td>1267</td>\n",
       "      <td>3923</td>\n",
       "      <td>18771</td>\n",
       "      <td>0.048416</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0000d0aabe8c188f88c756ce0f7f9639</td>\n",
       "      <td>31c50bb6df1a3087c29f43d9cfda0197</td>\n",
       "      <td>5faba9f1a020e6409c67231ed54330f5</td>\n",
       "      <td>0</td>\n",
       "      <td>2d8b036894dfcc6fd909003ea9c1641c</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>70</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>11636.0</td>\n",
       "      <td>1533</td>\n",
       "      <td>3923</td>\n",
       "      <td>18771</td>\n",
       "      <td>0.044119</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0000d0aabe8c188f88c756ce0f7f9639</td>\n",
       "      <td>31c50bb6df1a3087c29f43d9cfda0197</td>\n",
       "      <td>ad7dc1e1c178d3348da07e30b219447b</td>\n",
       "      <td>0</td>\n",
       "      <td>2d8b036894dfcc6fd909003ea9c1641c</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>70</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>11636.0</td>\n",
       "      <td>871</td>\n",
       "      <td>3923</td>\n",
       "      <td>18771</td>\n",
       "      <td>0.025241</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0000d0aabe8c188f88c756ce0f7f9639</td>\n",
       "      <td>31c50bb6df1a3087c29f43d9cfda0197</td>\n",
       "      <td>0cc56b6914c87d906b23e6d90cd4f65e</td>\n",
       "      <td>0</td>\n",
       "      <td>2d8b036894dfcc6fd909003ea9c1641c</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>70</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>11636.0</td>\n",
       "      <td>75311</td>\n",
       "      <td>3923</td>\n",
       "      <td>18771</td>\n",
       "      <td>0.078844</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0000d0aabe8c188f88c756ce0f7f9639</td>\n",
       "      <td>31c50bb6df1a3087c29f43d9cfda0197</td>\n",
       "      <td>44c640088903dbc7126d0d3271f2db5a</td>\n",
       "      <td>0</td>\n",
       "      <td>2d8b036894dfcc6fd909003ea9c1641c</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>70</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>11636.0</td>\n",
       "      <td>2737</td>\n",
       "      <td>3923</td>\n",
       "      <td>18771</td>\n",
       "      <td>0.057057</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0000d0aabe8c188f88c756ce0f7f9639</td>\n",
       "      <td>31c50bb6df1a3087c29f43d9cfda0197</td>\n",
       "      <td>db978f94b983a6ed3021e1e927ee09ef</td>\n",
       "      <td>0</td>\n",
       "      <td>2d8b036894dfcc6fd909003ea9c1641c</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>70</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>11636.0</td>\n",
       "      <td>34534</td>\n",
       "      <td>3923</td>\n",
       "      <td>18771</td>\n",
       "      <td>0.032822</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>0000d0aabe8c188f88c756ce0f7f9639</td>\n",
       "      <td>31c50bb6df1a3087c29f43d9cfda0197</td>\n",
       "      <td>cb5d6f075ed6aa4e838fec765c72cb8e</td>\n",
       "      <td>0</td>\n",
       "      <td>2d8b036894dfcc6fd909003ea9c1641c</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>70</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>11636.0</td>\n",
       "      <td>156</td>\n",
       "      <td>3923</td>\n",
       "      <td>18771</td>\n",
       "      <td>0.023749</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>0000d0aabe8c188f88c756ce0f7f9639</td>\n",
       "      <td>31c50bb6df1a3087c29f43d9cfda0197</td>\n",
       "      <td>4d5f8fa9ec5fc9f62e650b07f72eb71a</td>\n",
       "      <td>0</td>\n",
       "      <td>2d8b036894dfcc6fd909003ea9c1641c</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>70</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>11636.0</td>\n",
       "      <td>1768</td>\n",
       "      <td>3923</td>\n",
       "      <td>18771</td>\n",
       "      <td>0.036754</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>0000d0aabe8c188f88c756ce0f7f9639</td>\n",
       "      <td>31c50bb6df1a3087c29f43d9cfda0197</td>\n",
       "      <td>4b0deff6de7ef9e488269aae5286d510</td>\n",
       "      <td>0</td>\n",
       "      <td>2d8b036894dfcc6fd909003ea9c1641c</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>70</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>11636.0</td>\n",
       "      <td>30414</td>\n",
       "      <td>3923</td>\n",
       "      <td>18771</td>\n",
       "      <td>0.062382</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10 rows × 29 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                did                               vid  \\\n",
       "0  0000d0aabe8c188f88c756ce0f7f9639  31c50bb6df1a3087c29f43d9cfda0197   \n",
       "1  0000d0aabe8c188f88c756ce0f7f9639  31c50bb6df1a3087c29f43d9cfda0197   \n",
       "2  0000d0aabe8c188f88c756ce0f7f9639  31c50bb6df1a3087c29f43d9cfda0197   \n",
       "3  0000d0aabe8c188f88c756ce0f7f9639  31c50bb6df1a3087c29f43d9cfda0197   \n",
       "4  0000d0aabe8c188f88c756ce0f7f9639  31c50bb6df1a3087c29f43d9cfda0197   \n",
       "5  0000d0aabe8c188f88c756ce0f7f9639  31c50bb6df1a3087c29f43d9cfda0197   \n",
       "6  0000d0aabe8c188f88c756ce0f7f9639  31c50bb6df1a3087c29f43d9cfda0197   \n",
       "7  0000d0aabe8c188f88c756ce0f7f9639  31c50bb6df1a3087c29f43d9cfda0197   \n",
       "8  0000d0aabe8c188f88c756ce0f7f9639  31c50bb6df1a3087c29f43d9cfda0197   \n",
       "9  0000d0aabe8c188f88c756ce0f7f9639  31c50bb6df1a3087c29f43d9cfda0197   \n",
       "\n",
       "                           vid_next  label                               cid  \\\n",
       "0  2f8899ad36408f3317612fcbf1e7f2c6      0  2d8b036894dfcc6fd909003ea9c1641c   \n",
       "1  247c0febd952461cb7c7907d30a2c6ef      0  2d8b036894dfcc6fd909003ea9c1641c   \n",
       "2  5faba9f1a020e6409c67231ed54330f5      0  2d8b036894dfcc6fd909003ea9c1641c   \n",
       "3  ad7dc1e1c178d3348da07e30b219447b      0  2d8b036894dfcc6fd909003ea9c1641c   \n",
       "4  0cc56b6914c87d906b23e6d90cd4f65e      0  2d8b036894dfcc6fd909003ea9c1641c   \n",
       "5  44c640088903dbc7126d0d3271f2db5a      0  2d8b036894dfcc6fd909003ea9c1641c   \n",
       "6  db978f94b983a6ed3021e1e927ee09ef      0  2d8b036894dfcc6fd909003ea9c1641c   \n",
       "7  cb5d6f075ed6aa4e838fec765c72cb8e      0  2d8b036894dfcc6fd909003ea9c1641c   \n",
       "8  4d5f8fa9ec5fc9f62e650b07f72eb71a      0  2d8b036894dfcc6fd909003ea9c1641c   \n",
       "9  4b0deff6de7ef9e488269aae5286d510      0  2d8b036894dfcc6fd909003ea9c1641c   \n",
       "\n",
       "   is_intact  serialno  classify_id  series_id  duration  ...  update_vid  \\\n",
       "0          1         1            2          0        70  ...         0.0   \n",
       "1          1         1            2          0        70  ...         0.0   \n",
       "2          1         1            2          0        70  ...         0.0   \n",
       "3          1         1            2          0        70  ...         0.0   \n",
       "4          1         1            2          0        70  ...         0.0   \n",
       "5          1         1            2          0        70  ...         0.0   \n",
       "6          1         1            2          0        70  ...         0.0   \n",
       "7          1         1            2          0        70  ...         0.0   \n",
       "8          1         1            2          0        70  ...         0.0   \n",
       "9          1         1            2          0        70  ...         0.0   \n",
       "\n",
       "   update_vid_abs_1 update_vid_abs_2 update_vid_abs_3 update_vid_abs_5  \\\n",
       "0               1.0              1.0              1.0              1.0   \n",
       "1               1.0              1.0              1.0              1.0   \n",
       "2               1.0              1.0              1.0              1.0   \n",
       "3               1.0              1.0              1.0              1.0   \n",
       "4               0.0              0.0              0.0              0.0   \n",
       "5               0.0              0.0              0.0              0.0   \n",
       "6               0.0              0.0              0.0              0.0   \n",
       "7               0.0              0.0              0.0              0.0   \n",
       "8               0.0              0.0              0.0              0.0   \n",
       "9               0.0              0.0              0.0              0.0   \n",
       "\n",
       "  vid_last_click_time_diff  nunique_click_vid_next_did  nunique_click_vid_did  \\\n",
       "0                  11636.0                         488                   3923   \n",
       "1                  11636.0                        1267                   3923   \n",
       "2                  11636.0                        1533                   3923   \n",
       "3                  11636.0                         871                   3923   \n",
       "4                  11636.0                       75311                   3923   \n",
       "5                  11636.0                        2737                   3923   \n",
       "6                  11636.0                       34534                   3923   \n",
       "7                  11636.0                         156                   3923   \n",
       "8                  11636.0                        1768                   3923   \n",
       "9                  11636.0                       30414                   3923   \n",
       "\n",
       "   nunique_click_cid_did     preds  \n",
       "0                  18771  0.040259  \n",
       "1                  18771  0.048416  \n",
       "2                  18771  0.044119  \n",
       "3                  18771  0.025241  \n",
       "4                  18771  0.078844  \n",
       "5                  18771  0.057057  \n",
       "6                  18771  0.032822  \n",
       "7                  18771  0.023749  \n",
       "8                  18771  0.036754  \n",
       "9                  18771  0.062382  \n",
       "\n",
       "[10 rows x 29 columns]"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first ten rows of test_data (candidate rows with their 'preds' column).\n",
    "test_data.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "id": "12dad6d0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                did                          vid_next  \\\n",
      "0  ffff856232794b1cff1baea25bc25786  ae33bc830f425989650b45e9bf545d9e   \n",
      "1  ffff856232794b1cff1baea25bc25786  4a87208053004ae71e82c4af599de242   \n",
      "2  ffff856232794b1cff1baea25bc25786  0cc56b6914c87d906b23e6d90cd4f65e   \n",
      "3  ffff856232794b1cff1baea25bc25786  1237114f0d07f3e17e19f6fd5c3fe3bd   \n",
      "4  ffff856232794b1cff1baea25bc25786  0c6ff0068e0648b7436919ed5a37202f   \n",
      "\n",
      "      preds  \n",
      "0  0.445193  \n",
      "1  0.091985  \n",
      "2  0.079894  \n",
      "3  0.072589  \n",
      "4  0.062424  \n"
     ]
    }
   ],
   "source": [
    "# Sort all rows in descending order of did, then preds, then vid_next, so that\n",
    "# within each did the highest 'preds' comes first (vid_next breaks ties).\n",
    "test_data = (\n",
    "    test_data\n",
    "    .sort_values(by=['did', 'preds', 'vid_next'], ascending=False)\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "print(test_data[['did', 'vid_next', 'preds']].head(5))\n",
    "# 1-based position of every row inside its did group, following the sort above.\n",
    "test_data['preds_rank'] = test_data.groupby(['did']).cumcount() + 1\n",
    "# Keep only the rows ranked 1..6 for each did, with the three output columns.\n",
    "test_solution = test_data.loc[\n",
    "    test_data['preds_rank'] <= 6, ['did', 'vid_next', 'preds_rank']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "9f78fd61",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relabel the ranked columns to the did / vid / rank header used in the output file.\n",
    "# A name-based mapping keeps each label tied to its source column instead of its\n",
    "# position; labels not present in the frame are ignored, so re-running is harmless.\n",
    "test_solution = test_solution.rename(\n",
    "    columns={'vid_next': 'vid', 'preds_rank': 'rank'})\n",
    "# index=False is the documented way to leave the row index out of the CSV.\n",
    "test_solution.to_csv('./video_predict_data/res_rule_0818_1517.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c67aeb8",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:uie_py38]",
   "language": "python",
   "name": "conda-env-uie_py38-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
